ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
86#include "llvm/IR/Type.h"
87#include "llvm/IR/User.h"
88#include "llvm/IR/Value.h"
89#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <limits>
112#include <optional>
113#include <tuple>
114#include <utility>
115#include <vector>
116
117using namespace llvm;
118
119#define DEBUG_TYPE "arm-isel"
120
121STATISTIC(NumTailCalls, "Number of tail calls");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
163
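// Re-extend a narrow (sub-i32) integer value received from a CMSE non-secure
// call in the caller: truncate to the original argument type, then extend
// back to i32, so the caller does not rely on the callee having performed the
// ABI-mandated extension (see the CMSE handling in LowerCallResult below).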
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
171 MVT::i32, Trunc);
172 return Ext;
173}
174
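// Configure the common operation actions for a NEON vector type: loads and
// stores are promoted to PromotedLdStVT when it differs from VT, NEON's lack
// of vector divide/remainder is accounted for, and integer abs/abd/min/max
// (for non-64-bit element types) and saturating add/sub are marked legal.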
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
177 setOperationAction(ISD::LOAD, VT, Promote);
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
180 setOperationAction(ISD::STORE, VT, Promote);
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
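// Mark every target-independent operation on VT as Expand, then restore the
// few trivially supported ones (BITCAST, LOAD, STORE) to Legal. This is used
// for types that can live in FP/vector registers but have no native
// arithmetic on the current subtarget.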
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
250 setOperationAction(ISD::BITCAST, VT, Legal);
251 setOperationAction(ISD::LOAD, VT, Legal);
252 setOperationAction(ISD::STORE, VT, Legal);
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
281 setOperationAction(ISD::MLOAD, VT, Custom);
282 setOperationAction(ISD::MSTORE, VT, Legal);
297
298 // No native support for these.
308
309 // Vector reductions
310 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
315 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
318 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
332 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether we have MVE.fp or not
355 setOperationAction(ISD::MLOAD, VT, Custom);
356 setOperationAction(ISD::MSTORE, VT, Legal);
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
362 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
367 }
368
369 if (HasMVEFP) {
370 setOperationAction(ISD::FMINNUM, VT, Legal);
371 setOperationAction(ISD::FMAXNUM, VT, Legal);
372 setOperationAction(ISD::FROUND, VT, Legal);
373 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
374 setOperationAction(ISD::FRINT, VT, Legal);
375 setOperationAction(ISD::FTRUNC, VT, Legal);
376 setOperationAction(ISD::FFLOOR, VT, Legal);
377 setOperationAction(ISD::FCEIL, VT, Legal);
378 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
381 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
382
383 // No native support for these.
386 setOperationAction(ISD::FSQRT, VT, Expand);
387 setOperationAction(ISD::FSIN, VT, Expand);
388 setOperationAction(ISD::FCOS, VT, Expand);
389 setOperationAction(ISD::FTAN, VT, Expand);
390 setOperationAction(ISD::FPOW, VT, Expand);
391 setOperationAction(ISD::FLOG, VT, Expand);
392 setOperationAction(ISD::FLOG2, VT, Expand);
393 setOperationAction(ISD::FLOG10, VT, Expand);
394 setOperationAction(ISD::FEXP, VT, Expand);
395 setOperationAction(ISD::FEXP2, VT, Expand);
396 setOperationAction(ISD::FEXP10, VT, Expand);
397 setOperationAction(ISD::FNEARBYINT, VT, Expand);
398 }
399 }
400
 401 // Custom-expand smaller-than-legal vector reductions to prevent false zero
 402 // items from being added.
403 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
410 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
 413 // MVE integer-only / float support. Only FP data processing on the FP
 414 // vector types is inhibited at the integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
451 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
472 setOperationAction(ISD::LOAD, VT, Custom);
473 setOperationAction(ISD::STORE, VT, Custom);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (TT.isOSBinFormatMachO()) {
523 // Uses VFP for Thumb libfuncs if available.
524 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
525 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
526 // clang-format off
527 static const struct {
528 const RTLIB::Libcall Op;
529 const RTLIB::LibcallImpl Impl;
530 } LibraryCalls[] = {
531 // Single-precision floating-point arithmetic.
532 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
533 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
534 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
535 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
536
537 // Double-precision floating-point arithmetic.
538 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
539 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
540 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
541 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
542
543 // Single-precision comparisons.
544 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
545 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
546 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
547 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
548 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
549 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
550 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
551
552 // Double-precision comparisons.
553 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
554 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
555 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
556 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
557 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
558 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
559 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
560
561 // Floating-point to integer conversions.
562 // i64 conversions are done via library routines even when generating VFP
563 // instructions, so use the same ones.
564 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
565 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
566 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
567 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
568
569 // Conversions between floating types.
570 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
571 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
572
573 // Integer to floating-point conversions.
574 // i64 conversions are done via library routines even when generating VFP
575 // instructions, so use the same ones.
576 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
577 // e.g., __floatunsidf vs. __floatunssidfvfp.
578 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
579 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
580 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
581 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
582 };
583 // clang-format on
584
585 for (const auto &LC : LibraryCalls)
586 setLibcallImpl(LC.Op, LC.Impl);
587 }
588 }
589
590 if (Subtarget->isThumb1Only())
591 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
592 else
593 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
594
595 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
596 Subtarget->hasFPRegs()) {
597 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
598 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
599
604
605 if (!Subtarget->hasVFP2Base())
606 setAllExpand(MVT::f32);
607 if (!Subtarget->hasFP64())
608 setAllExpand(MVT::f64);
609 }
610
611 if (Subtarget->hasFullFP16()) {
612 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
613 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
614 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
615
616 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
617 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
618 }
619
620 if (Subtarget->hasBF16()) {
621 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
622 setAllExpand(MVT::bf16);
623 if (!Subtarget->hasFullFP16())
624 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
625 } else {
626 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
627 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
628 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
629 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
630 }
631
633 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
634 setTruncStoreAction(VT, InnerVT, Expand);
635 addAllExtLoads(VT, InnerVT, Expand);
636 }
637
640
642 }
643
644 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
646
647 if (!Subtarget->hasV8_1MMainlineOps())
649
650 if (!Subtarget->isThumb1Only())
652
655
658
659 if (Subtarget->hasMVEIntegerOps())
660 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
661
662 // Combine low-overhead loop intrinsics so that we can lower i1 types.
663 if (Subtarget->hasLOB()) {
664 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
665 }
666
667 if (Subtarget->hasNEON()) {
668 addDRTypeForNEON(MVT::v2f32);
669 addDRTypeForNEON(MVT::v8i8);
670 addDRTypeForNEON(MVT::v4i16);
671 addDRTypeForNEON(MVT::v2i32);
672 addDRTypeForNEON(MVT::v1i64);
673
674 addQRTypeForNEON(MVT::v4f32);
675 addQRTypeForNEON(MVT::v2f64);
676 addQRTypeForNEON(MVT::v16i8);
677 addQRTypeForNEON(MVT::v8i16);
678 addQRTypeForNEON(MVT::v4i32);
679 addQRTypeForNEON(MVT::v2i64);
680
681 if (Subtarget->hasFullFP16()) {
682 addQRTypeForNEON(MVT::v8f16);
683 addDRTypeForNEON(MVT::v4f16);
684 }
685
686 if (Subtarget->hasBF16()) {
687 addQRTypeForNEON(MVT::v8bf16);
688 addDRTypeForNEON(MVT::v4bf16);
689 }
690 }
691
692 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
693 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
694 // none of Neon, MVE or VFP supports any arithmetic operations on it.
695 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
696 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
697 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
698 // FIXME: Code duplication: FDIV and FREM are expanded always, see
699 // ARMTargetLowering::addTypeForNEON method for details.
700 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
701 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
702 // FIXME: Create unittest.
 703 // In other words, find a case where "copysign" appears in the DAG with vector
 704 // operands.
706 // FIXME: Code duplication: SETCC has custom operation action, see
707 // ARMTargetLowering::addTypeForNEON method for details.
709 // FIXME: Create unittest for FNEG and for FABS.
710 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
711 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
712 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
713 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
714 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
715 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
716 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
717 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
718 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
719 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
720 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
721 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
722 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
723 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
724 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
725 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
726 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
727 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
728 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
729 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
730 }
731
732 if (Subtarget->hasNEON()) {
 733 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
 734 // natively supported for v4f32.
735 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
736 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
737 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
738 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
739 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
740 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
741 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
742 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
743 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
744 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
745 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
746 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
747 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
748 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
749 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
750 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
751 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
752
753 // Mark v2f32 intrinsics.
754 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
755 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
756 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
757 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
758 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
759 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
760 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
761 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
762 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
763 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
764 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
765 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
766 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
767 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
768 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
769 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
770 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
771
772 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
773 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
774 setOperationAction(Op, MVT::v4f16, Expand);
775 setOperationAction(Op, MVT::v8f16, Expand);
776 }
777
778 // Neon does not support some operations on v1i64 and v2i64 types.
779 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
780 // Custom handling for some quad-vector types to detect VMULL.
781 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
782 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
783 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
784 // Custom handling for some vector types to avoid expensive expansions
785 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
787 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 789 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
 790 // a destination type that is wider than the source, nor does it have
 791 // an FP_TO_[SU]INT instruction with a narrower destination than the
792 // source.
801
803 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
804
 805 // NEON does not have a single-instruction CTPOP for vectors with element
 806 // types wider than 8 bits. However, custom lowering can leverage the
807 // v8i8/v16i8 vcnt instruction.
814
815 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
816 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
817
 818 // NEON does not have a single-instruction CTTZ for vectors.
820 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
821 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
822 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
823
824 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
825 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
826 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
827 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
828
833
838
842 }
843
844 // NEON only has FMA instructions as of VFP4.
845 if (!Subtarget->hasVFP4Base()) {
846 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
847 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
848 }
849
851 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
852
853 // It is legal to extload from v4i8 to v4i16 or v4i32.
854 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
855 MVT::v2i32}) {
860 }
861 }
862
863 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
864 MVT::v4i32}) {
865 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
866 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
867 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
868 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
869 }
870 }
871
872 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
878 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
879 }
880 if (Subtarget->hasMVEIntegerOps()) {
882 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
883 ISD::SETCC});
884 }
885 if (Subtarget->hasMVEFloatOps()) {
887 }
888
889 if (!Subtarget->hasFP64()) {
890 // When targeting a floating-point unit with only single-precision
891 // operations, f64 is legal for the few double-precision instructions which
 892 // are present. However, no double-precision operations other than moves,
893 // loads and stores are provided by the hardware.
902 setOperationAction(ISD::FNEG, MVT::f64, Expand);
903 setOperationAction(ISD::FABS, MVT::f64, Expand);
904 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
905 setOperationAction(ISD::FSIN, MVT::f64, Expand);
906 setOperationAction(ISD::FCOS, MVT::f64, Expand);
907 setOperationAction(ISD::FPOW, MVT::f64, Expand);
908 setOperationAction(ISD::FLOG, MVT::f64, Expand);
909 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
910 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
911 setOperationAction(ISD::FEXP, MVT::f64, Expand);
912 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
913 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
914 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
915 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
916 setOperationAction(ISD::FRINT, MVT::f64, Expand);
917 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
918 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
919 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
932 }
933
934 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
935 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
937 if (Subtarget->hasFullFP16()) {
940 }
941 }
942
943 if (!Subtarget->hasFP16()) {
944 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
946 }
947
948 computeRegisterProperties(Subtarget->getRegisterInfo());
949
950 // ARM does not have floating-point extending loads.
951 for (MVT VT : MVT::fp_valuetypes()) {
952 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
953 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
954 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
955 }
956
957 // ... or truncating stores
958 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
959 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
960 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
961 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
962 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
963
 964 // ARM does not have an i1 sign-extending load.
965 for (MVT VT : MVT::integer_valuetypes())
967
968 // ARM supports all 4 flavors of integer indexed load / store.
969 if (!Subtarget->isThumb1Only()) {
970 for (unsigned im = (unsigned)ISD::PRE_INC;
972 setIndexedLoadAction(im, MVT::i1, Legal);
973 setIndexedLoadAction(im, MVT::i8, Legal);
974 setIndexedLoadAction(im, MVT::i16, Legal);
975 setIndexedLoadAction(im, MVT::i32, Legal);
976 setIndexedStoreAction(im, MVT::i1, Legal);
977 setIndexedStoreAction(im, MVT::i8, Legal);
978 setIndexedStoreAction(im, MVT::i16, Legal);
979 setIndexedStoreAction(im, MVT::i32, Legal);
980 }
981 } else {
982 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
985 }
986
991
994 if (Subtarget->hasDSP()) {
1003 }
1004 if (Subtarget->hasBaseDSP()) {
1007 }
1008
1009 // i64 operation support.
1012 if (Subtarget->isThumb1Only()) {
1015 }
1016 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1017 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1019
1027 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1028 setOperationAction(ISD::STORE, MVT::i64, Custom);
1029
 1030 // MVE lowers 64-bit shifts to lsll and lsrl,
 1031 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1032 if (Subtarget->hasMVEIntegerOps())
1034
1035 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1036 if (Subtarget->isThumb1Only()) {
1040 }
1041
1042 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1044
1045 // ARM does not have ROTL.
1050 }
1052 // TODO: These two should be set to LibCall, but this currently breaks
1053 // the Linux kernel build. See #101786.
1056 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1059 }
1060
1061 // @llvm.readcyclecounter requires the Performance Monitors extension.
1062 // Default to the 0 expansion on unsupported platforms.
1063 // FIXME: Technically there are older ARM CPUs that have
1064 // implementation-specific ways of obtaining this information.
1065 if (Subtarget->hasPerfMon())
1066 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1067
1068 // Only ARMv6 has BSWAP.
1069 if (!Subtarget->hasV6Ops())
1071
1072 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1073 : Subtarget->hasDivideInARMMode();
1074 if (!hasDivide) {
 1075 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1078 }
1079
1080 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1083
1086 }
1087
1090
 1091 // Register-based DivRem for AEABI (RTABI 4.2)
1092 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1093 TT.isTargetMuslAEABI() || TT.isOSWindows()) {
1096 HasStandaloneRem = false;
1097
1102 } else {
1105 }
1106
1111
1112 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1113 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1114
1115 // Use the default implementation.
1116 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1117 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1118 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1119 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1120 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1121 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1122
1123 if (TT.isOSWindows())
1124 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1125 else
1126 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1127
1128 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1129 // the default expansion.
1130 InsertFencesForAtomic = false;
1131 if (Subtarget->hasAnyDataBarrier() &&
1132 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1133 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1134 // to ldrex/strex loops already.
1135 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1136 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1137 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1138
1139 // On v8, we have particularly efficient implementations of atomic fences
1140 // if they can be combined with nearby atomic loads and stores.
1141 if (!Subtarget->hasAcquireRelease() ||
1142 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1143 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1144 InsertFencesForAtomic = true;
1145 }
1146 } else {
1147 // If there's anything we can use as a barrier, go through custom lowering
1148 // for ATOMIC_FENCE.
 1149 // If the target has DMB in Thumb, fences can be inserted.
1150 if (Subtarget->hasDataBarrier())
1151 InsertFencesForAtomic = true;
1152
1153 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1154 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1155
1156 // Set them all for libcall, which will force libcalls.
1157 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1158 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1159 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1160 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1161 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1162 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1163 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1164 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1165 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1166 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1167 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1168 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1169 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1170 // Unordered/Monotonic case.
1171 if (!InsertFencesForAtomic) {
1172 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1173 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1174 }
1175 }
1176
1177 // Compute supported atomic widths.
1178 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1179 // For targets where __sync_* routines are reliably available, we use them
1180 // if necessary.
1181 //
1182 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1183 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1184 //
1185 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1186 // such targets should provide __sync_* routines, which use the ARM mode
1187 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1188 // encoding; see ARMISD::MEMBARRIER_MCR.)
1190 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1191 Subtarget->hasForced32BitAtomics()) {
 1192 // Cortex-M (besides Cortex-M0) has 32-bit atomics.
1194 } else {
1195 // We can't assume anything about other targets; just use libatomic
1196 // routines.
1198 }
1199
1201
1202 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1203
1204 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1205 if (!Subtarget->hasV6Ops()) {
1208 }
1210
1211 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1212 !Subtarget->isThumb1Only()) {
1213 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1214 // iff target supports vfp2.
1215 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1217 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1218 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1219 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1220 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1221 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1222 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1223 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1224 }
1225
1226 // We want to custom lower some of our intrinsics.
1231
1241 if (Subtarget->hasFullFP16()) {
1245 }
1246
1248
1249 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1250 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1251 if (Subtarget->hasFullFP16())
1252 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1253 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1254 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1255 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1256
1257 // We don't support sin/cos/fmod/copysign/pow
1258 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1259 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1260 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1261 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1262 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1263 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1266 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1267 !Subtarget->isThumb1Only()) {
1270 }
1271 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1272 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1273
1274 if (!Subtarget->hasVFP4Base()) {
1277 }
1278
1279 // Various VFP goodness
1280 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1281 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1282 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1283 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1284 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1285 }
1286
1287 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1288 if (!Subtarget->hasFP16()) {
1289 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1290 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1291 }
1292
1293 // Strict floating-point comparisons need custom lowering.
1300 }
1301
1302 // Use __sincos_stret if available.
1303 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1304 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1305 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1306 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1307 }
1308
1309 // FP-ARMv8 implements a lot of rounding-like FP operations.
1310 if (Subtarget->hasFPARMv8Base()) {
1311 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1312 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1313 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1314 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1315 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1316 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1317 setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
1318 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1319 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1320 if (Subtarget->hasNEON()) {
1321 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1322 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1323 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1324 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1325 }
1326
1327 if (Subtarget->hasFP64()) {
1328 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1329 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1330 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1331 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1332 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1333 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1334 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
1335 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1336 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1337 }
1338 }
1339
 1340 // FP16 operations often need to be promoted in order to call library functions
1341 if (Subtarget->hasFullFP16()) {
1344 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1345 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1346 setOperationAction(ISD::FTAN, MVT::f16, Promote);
1347 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1348 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1349 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1350 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1351 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1352 setOperationAction(ISD::FEXP10, MVT::f16, Promote);
1353 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1354 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1355 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1356 setOperationAction(ISD::LRINT, MVT::f16, Expand);
1357
1358 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1359 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1360 setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
1361 setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
1362 setOperationAction(ISD::FRINT, MVT::f16, Legal);
1363 setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
1364 setOperationAction(ISD::FCEIL, MVT::f16, Legal);
1365 }
1366
1367 if (Subtarget->hasNEON()) {
1368 // vmin and vmax aren't available in a scalar form, so we can use
1369 // a NEON instruction with an undef lane instead.
1370 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1371 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1372 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1373 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1374 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1375 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1376 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1377 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1378
1379 if (Subtarget->hasV8Ops()) {
1380 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1381 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1382 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1383 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1384 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1385 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1386 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1387 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1388 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1389 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1390 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1391 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1392 }
1393
1394 if (Subtarget->hasFullFP16()) {
1395 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1396 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1397 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1398 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1399
1400 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1401 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1402 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1403 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1404
1405 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1406 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1407 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1408 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1409 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1410 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1411 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1412 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1413 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1414 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1415 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1416 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1417 }
1418 }
1419
1420 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1421 // it, but it's just a wrapper around ldexp.
1422 if (TT.isOSWindows()) {
1423 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1424 if (isOperationExpand(Op, MVT::f32))
1425 setOperationAction(Op, MVT::f32, Promote);
1426 }
1427
1428 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1429 // isn't legal.
1430 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1431 if (isOperationExpand(Op, MVT::f16))
1432 setOperationAction(Op, MVT::f16, Promote);
1433
1434 // We have target-specific dag combine patterns for the following nodes:
1435 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1438
1439 if (Subtarget->hasMVEIntegerOps())
1441
1442 if (Subtarget->hasV6Ops())
1444 if (Subtarget->isThumb1Only())
1446 // Attempt to lower smin/smax to ssat/usat
1447 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1448 Subtarget->isThumb2()) {
1450 }
1451
1453
1454 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1455 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1457 else
1459
1460 //// temporary - rewrite interface to use type
1463 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1465 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1467
 1468 // On ARM, arguments smaller than 4 bytes are extended, so all arguments
1469 // are at least 4 bytes aligned.
1471
1472 // Prefer likely predicted branches to selects on out-of-order cores.
1473 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1474
1475 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1477 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1478
1479 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1480}
1481
1483 return Subtarget->useSoftFloat();
1484}
1485
1487 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1488}
1489
1490// FIXME: It might make sense to define the representative register class as the
1491// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1492// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1493// SPR's representative would be DPR_VFP2. This should work well if register
1494// pressure tracking were modified such that a register use would increment the
1495// pressure of the register class's representative and all of its super
1496// classes' representatives transitively. We have not implemented this because
1497// of the difficulty prior to coalescing of modeling operand register classes
1498// due to the common occurrence of cross class copies and subregister insertions
1499// and extractions.
1500std::pair<const TargetRegisterClass *, uint8_t>
1502 MVT VT) const {
1503 const TargetRegisterClass *RRC = nullptr;
1504 uint8_t Cost = 1;
1505 switch (VT.SimpleTy) {
1506 default:
1508 // Use DPR as representative register class for all floating point
 1509 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1510 // the cost is 1 for both f32 and f64.
1511 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1512 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1513 RRC = &ARM::DPRRegClass;
1514 // When NEON is used for SP, only half of the register file is available
1515 // because operations that define both SP and DP results will be constrained
1516 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1517 // coalescing by double-counting the SP regs. See the FIXME above.
1518 if (Subtarget->useNEONForSinglePrecisionFP())
1519 Cost = 2;
1520 break;
1521 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1522 case MVT::v4f32: case MVT::v2f64:
1523 RRC = &ARM::DPRRegClass;
1524 Cost = 2;
1525 break;
1526 case MVT::v4i64:
1527 RRC = &ARM::DPRRegClass;
1528 Cost = 4;
1529 break;
1530 case MVT::v8i64:
1531 RRC = &ARM::DPRRegClass;
1532 Cost = 8;
1533 break;
1534 }
1535 return std::make_pair(RRC, Cost);
1536}
1537
1538const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1539#define MAKE_CASE(V) \
1540 case V: \
1541 return #V;
1542 switch ((ARMISD::NodeType)Opcode) {
1544 break;
1747#undef MAKE_CASE
1748 }
1749 return nullptr;
1750}
1751
1753 EVT VT) const {
1754 if (!VT.isVector())
1755 return getPointerTy(DL);
1756
1757 // MVE has a predicate register.
1758 if ((Subtarget->hasMVEIntegerOps() &&
1759 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1760 VT == MVT::v16i8)) ||
1761 (Subtarget->hasMVEFloatOps() &&
1762 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1763 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1765}
1766
1767/// getRegClassFor - Return the register class that should be used for the
1768/// specified value type.
1769const TargetRegisterClass *
1770ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1771 (void)isDivergent;
1772 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1773 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1774 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1775 // MVE Q registers.
1776 if (Subtarget->hasNEON()) {
1777 if (VT == MVT::v4i64)
1778 return &ARM::QQPRRegClass;
1779 if (VT == MVT::v8i64)
1780 return &ARM::QQQQPRRegClass;
1781 }
1782 if (Subtarget->hasMVEIntegerOps()) {
1783 if (VT == MVT::v4i64)
1784 return &ARM::MQQPRRegClass;
1785 if (VT == MVT::v8i64)
1786 return &ARM::MQQQQPRRegClass;
1787 }
1789}
1790
1791// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1792// source/dest is aligned and the copy size is large enough. We therefore want
1793// to align such objects passed to memory intrinsics.
1795 Align &PrefAlign) const {
1796 if (!isa<MemIntrinsic>(CI))
1797 return false;
1798 MinSize = 8;
1799 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1800 // cycle faster than 4-byte aligned LDM.
1801 PrefAlign =
1802 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1803 return true;
1804}
1805
1806// Create a fast isel object.
1807FastISel *
1809 const TargetLibraryInfo *libInfo) const {
1810 return ARM::createFastISel(funcInfo, libInfo);
1811}
1812
1814 unsigned NumVals = N->getNumValues();
1815 if (!NumVals)
1816 return Sched::RegPressure;
1817
1818 for (unsigned i = 0; i != NumVals; ++i) {
1819 EVT VT = N->getValueType(i);
1820 if (VT == MVT::Glue || VT == MVT::Other)
1821 continue;
1822 if (VT.isFloatingPoint() || VT.isVector())
1823 return Sched::ILP;
1824 }
1825
1826 if (!N->isMachineOpcode())
1827 return Sched::RegPressure;
1828
 1829 // Loads are scheduled for latency even if the instruction itinerary
1830 // is not available.
1831 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1832 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1833
1834 if (MCID.getNumDefs() == 0)
1835 return Sched::RegPressure;
1836 if (!Itins->isEmpty() &&
1837 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1838 return Sched::ILP;
1839
1840 return Sched::RegPressure;
1841}
1842
1843//===----------------------------------------------------------------------===//
1844// Lowering Code
1845//===----------------------------------------------------------------------===//
1846
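// Helpers that match a shift-right-logical, shift-right-arithmetic or
// shift-left by exactly 16 bits; used below when recognising the high or low
// 16-bit half of an i32 value.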
1847static bool isSRL16(const SDValue &Op) {
1848 if (Op.getOpcode() != ISD::SRL)
1849 return false;
1850 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1851 return Const->getZExtValue() == 16;
1852 return false;
1853}
1854
1855static bool isSRA16(const SDValue &Op) {
1856 if (Op.getOpcode() != ISD::SRA)
1857 return false;
1858 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1859 return Const->getZExtValue() == 16;
1860 return false;
1861}
1862
1863static bool isSHL16(const SDValue &Op) {
1864 if (Op.getOpcode() != ISD::SHL)
1865 return false;
1866 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1867 return Const->getZExtValue() == 16;
1868 return false;
1869}
1870
1871// Check for a signed 16-bit value. We special-case SRA because it makes it
1872// simpler when also looking for SRAs that aren't sign-extending a
1873// smaller value. Without the check, we'd need to take extra care with
1874// checking order for some operations.
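// For example, (sra (shl x, 16), 16) is accepted directly, as is any i32
// value for which the DAG computes exactly 17 sign bits (the top 17 bits are
// all copies of the sign bit, so the value fits in a signed 16-bit range).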
1875static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1876 if (isSRA16(Op))
1877 return isSHL16(Op.getOperand(0));
1878 return DAG.ComputeNumSignBits(Op) == 17;
1879}
1880
1881/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1883 switch (CC) {
1884 default: llvm_unreachable("Unknown condition code!");
1885 case ISD::SETNE: return ARMCC::NE;
1886 case ISD::SETEQ: return ARMCC::EQ;
1887 case ISD::SETGT: return ARMCC::GT;
1888 case ISD::SETGE: return ARMCC::GE;
1889 case ISD::SETLT: return ARMCC::LT;
1890 case ISD::SETLE: return ARMCC::LE;
1891 case ISD::SETUGT: return ARMCC::HI;
1892 case ISD::SETUGE: return ARMCC::HS;
1893 case ISD::SETULT: return ARMCC::LO;
1894 case ISD::SETULE: return ARMCC::LS;
1895 }
1896}
1897
1898/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1900 ARMCC::CondCodes &CondCode2) {
1901 CondCode2 = ARMCC::AL;
1902 switch (CC) {
1903 default: llvm_unreachable("Unknown FP condition!");
1904 case ISD::SETEQ:
1905 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1906 case ISD::SETGT:
1907 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1908 case ISD::SETGE:
1909 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1910 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1911 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1912 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1913 case ISD::SETO: CondCode = ARMCC::VC; break;
1914 case ISD::SETUO: CondCode = ARMCC::VS; break;
1915 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1916 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1917 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1918 case ISD::SETLT:
1919 case ISD::SETULT: CondCode = ARMCC::LT; break;
1920 case ISD::SETLE:
1921 case ISD::SETULE: CondCode = ARMCC::LE; break;
1922 case ISD::SETNE:
1923 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1924 }
1925}
1926
1927//===----------------------------------------------------------------------===//
1928// Calling Convention Implementation
1929//===----------------------------------------------------------------------===//
1930
1931/// getEffectiveCallingConv - Get the effective calling convention, taking into
1932/// account presence of floating point hardware and calling convention
1933/// limitations, such as support for variadic functions.
1935ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1936 bool isVarArg) const {
1937 switch (CC) {
1938 default:
1939 report_fatal_error("Unsupported calling convention");
1942 case CallingConv::GHC:
1944 return CC;
1950 case CallingConv::Swift:
1953 case CallingConv::C:
1954 case CallingConv::Tail:
1955 if (!getTM().isAAPCS_ABI())
1956 return CallingConv::ARM_APCS;
1957 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1958 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1959 !isVarArg)
1961 else
1963 case CallingConv::Fast:
1965 if (!getTM().isAAPCS_ABI()) {
1966 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1967 return CallingConv::Fast;
1968 return CallingConv::ARM_APCS;
1969 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1970 !isVarArg)
1972 else
1974 }
1975}
1976
1978 bool isVarArg) const {
1979 return CCAssignFnForNode(CC, false, isVarArg);
1980}
1981
1983 bool isVarArg) const {
1984 return CCAssignFnForNode(CC, true, isVarArg);
1985}
1986
1987/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1988/// CallingConvention.
1989CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1990 bool Return,
1991 bool isVarArg) const {
1992 switch (getEffectiveCallingConv(CC, isVarArg)) {
1993 default:
1994 report_fatal_error("Unsupported calling convention");
1996 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1998 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2000 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2001 case CallingConv::Fast:
2002 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2003 case CallingConv::GHC:
2004 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2006 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2008 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2010 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2011 }
2012}
2013
2014SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2015 MVT LocVT, MVT ValVT, SDValue Val) const {
2016 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2017 Val);
2018 if (Subtarget->hasFullFP16()) {
2019 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2020 } else {
2021 Val = DAG.getNode(ISD::TRUNCATE, dl,
2022 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2023 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2024 }
2025 return Val;
2026}
2027
2028SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2029 MVT LocVT, MVT ValVT,
2030 SDValue Val) const {
2031 if (Subtarget->hasFullFP16()) {
2032 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2033 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2034 } else {
2035 Val = DAG.getNode(ISD::BITCAST, dl,
2036 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2037 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2038 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2039 }
2040 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2041}
2042
2043/// LowerCallResult - Lower the result values of a call into the
2044/// appropriate copies out of appropriate physical registers.
2045SDValue ARMTargetLowering::LowerCallResult(
2046 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2047 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2048 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2049 SDValue ThisVal, bool isCmseNSCall) const {
2050 // Assign locations to each value returned by this call.
2052 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2053 *DAG.getContext());
2054 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2055
2056 // Copy all of the result registers out of their specified physreg.
2057 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2058 CCValAssign VA = RVLocs[i];
2059
2060 // Pass 'this' value directly from the argument to return value, to avoid
2061 // reg unit interference
2062 if (i == 0 && isThisReturn) {
2063 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2064 "unexpected return calling convention register assignment");
2065 InVals.push_back(ThisVal);
2066 continue;
2067 }
2068
2069 SDValue Val;
2070 if (VA.needsCustom() &&
2071 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2072 // Handle f64 or half of a v2f64.
2073 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2074 InGlue);
2075 Chain = Lo.getValue(1);
2076 InGlue = Lo.getValue(2);
2077 VA = RVLocs[++i]; // skip ahead to next loc
2078 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2079 InGlue);
2080 Chain = Hi.getValue(1);
2081 InGlue = Hi.getValue(2);
2082 if (!Subtarget->isLittle())
2083 std::swap (Lo, Hi);
2084 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2085
2086 if (VA.getLocVT() == MVT::v2f64) {
2087 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2088 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2089 DAG.getConstant(0, dl, MVT::i32));
2090
2091 VA = RVLocs[++i]; // skip ahead to next loc
2092 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2093 Chain = Lo.getValue(1);
2094 InGlue = Lo.getValue(2);
2095 VA = RVLocs[++i]; // skip ahead to next loc
2096 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2097 Chain = Hi.getValue(1);
2098 InGlue = Hi.getValue(2);
2099 if (!Subtarget->isLittle())
2100 std::swap (Lo, Hi);
2101 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2102 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2103 DAG.getConstant(1, dl, MVT::i32));
2104 }
2105 } else {
2106 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2107 InGlue);
2108 Chain = Val.getValue(1);
2109 InGlue = Val.getValue(2);
2110 }
2111
2112 switch (VA.getLocInfo()) {
2113 default: llvm_unreachable("Unknown loc info!");
2114 case CCValAssign::Full: break;
2115 case CCValAssign::BCvt:
2116 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2117 break;
2118 }
2119
2120 // f16 arguments have their size extended to 4 bytes and are passed as if they
2121 // had been copied to the LSBs of a 32-bit register.
2122 // For that, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
2123 if (VA.needsCustom() &&
2124 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2125 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2126
2127 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2128 // is less than 32 bits must be sign- or zero-extended after the call for
2129 // security reasons. Although the ABI mandates an extension done by the
2130 // callee, the latter cannot be trusted to follow the rules of the ABI.
2131 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2132 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2133 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2134 Val = handleCMSEValue(Val, Arg, DAG, dl);
2135
2136 InVals.push_back(Val);
2137 }
2138
2139 return Chain;
2140}
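// Illustrative sketch (register names assumed for exposition) of how the
// loop above reassembles a custom-lowered f64 result returned in r0/r1:
//
//   t1: i32,ch,glue = CopyFromReg r0
//   t2: i32,ch,glue = CopyFromReg r1
//   t3: f64         = ARMISD::VMOVDRR t1, t2   // halves swapped on BE
//
// A v2f64 result repeats this for each f64 lane and inserts the lanes into
// the vector with INSERT_VECTOR_ELT.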
2141
2142std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2143 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2144 bool IsTailCall, int SPDiff) const {
2145 SDValue DstAddr;
2146 MachinePointerInfo DstInfo;
2147 int32_t Offset = VA.getLocMemOffset();
2148 MachineFunction &MF = DAG.getMachineFunction();
2149
2150 if (IsTailCall) {
2151 Offset += SPDiff;
2152 auto PtrVT = getPointerTy(DAG.getDataLayout());
2153 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2154 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2155 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2156 DstInfo =
2158 } else {
2159 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2160 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2161 StackPtr, PtrOff);
2162 DstInfo =
2164 }
2165
2166 return std::make_pair(DstAddr, DstInfo);
2167}
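// Illustrative summary (offsets assumed for exposition) of the two address
// forms produced above: for a normal call, an argument at stack offset 8 is
// addressed as "SP + 8"; for a tail call, the same argument is addressed
// through a fixed-object FrameIndex created at offset 8 + SPDiff, because
// the store must land in the caller's incoming argument area once SP has
// been adjusted.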
2168
2169// Returns the type of copying which is required to set up a byval argument to
2170// a tail-called function. This isn't needed for non-tail calls, because they
2171 // always need the equivalent of CopyOnce, but tail-calls sometimes need two
2172 // copies to avoid clobbering another argument (CopyViaTemp), and sometimes can be
2173// optimised to zero copies when forwarding an argument from the caller's
2174// caller (NoCopy).
2175ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2176 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2177 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2178 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2179
2180 // Globals are always safe to copy from.
2182 return CopyOnce;
2183
2184 // Can only analyse frame index nodes, conservatively assume we need a
2185 // temporary.
2186 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2187 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2188 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2189 return CopyViaTemp;
2190
2191 int SrcFI = SrcFrameIdxNode->getIndex();
2192 int DstFI = DstFrameIdxNode->getIndex();
2193 assert(MFI.isFixedObjectIndex(DstFI) &&
2194 "byval passed in non-fixed stack slot");
2195
2196 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2197 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2198
2199 // If the source is in the local frame, then the copy to the argument memory
2200 // is always valid.
2201 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2202 if (!FixedSrc ||
2203 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2204 return CopyOnce;
2205
2206 // In the case of byval arguments split between registers and the stack,
2207 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2208 // stack portion, but the Src SDValue will refer to the full value, including
2209 // the local stack memory that the register portion gets stored into. We only
2210 // need to compare them for equality, so normalise on the full value version.
2211 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2212 DstOffset -= RegSize;
2213
2214 // If the value is already in the correct location, then no copying is
2215 // needed. If not, then we need to copy via a temporary.
2216 if (SrcOffset == DstOffset)
2217 return NoCopy;
2218 else
2219 return CopyViaTemp;
2220}
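// Illustrative scenarios (frame layouts assumed for exposition) for the
// three results above:
//   * CopyOnce:    the source is a global or lives in the caller's local
//                  frame; nothing else set up for the tail call can clobber
//                  it, so one copy into the argument area suffices.
//   * NoCopy:      the caller received the same byval at the same fixed
//                  offset and is simply forwarding it, so the bytes are
//                  already where the callee expects them.
//   * CopyViaTemp: the source may overlap the outgoing argument area (e.g.
//                  it is another incoming stack argument), so it is first
//                  copied into a local temporary and only later into place.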
2221
2222void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2223 SDValue Chain, SDValue &Arg,
2224 RegsToPassVector &RegsToPass,
2225 CCValAssign &VA, CCValAssign &NextVA,
2226 SDValue &StackPtr,
2227 SmallVectorImpl<SDValue> &MemOpChains,
2228 bool IsTailCall,
2229 int SPDiff) const {
2230 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2231 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2232 unsigned id = Subtarget->isLittle() ? 0 : 1;
2233 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2234
2235 if (NextVA.isRegLoc())
2236 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2237 else {
2238 assert(NextVA.isMemLoc());
2239 if (!StackPtr.getNode())
2240 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2242
2243 SDValue DstAddr;
2244 MachinePointerInfo DstInfo;
2245 std::tie(DstAddr, DstInfo) =
2246 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2247 MemOpChains.push_back(
2248 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2249 }
2250}
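// Illustrative sketch (register assignment assumed for exposition) of the
// split performed above: an f64 argument assigned to r2/r3 becomes
//
//   t1: i32,i32 = ARMISD::VMOVRRD Arg        // the two 32-bit halves
//   RegsToPass += { (r2, t1:0), (r3, t1:1) } // halves swapped on BE
//
// and when only the first half is assigned a register, the second half is
// instead stored to the stack slot computed by computeAddrForCallArg.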
2251
2252static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2253 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2255}
2256
2257/// LowerCall - Lowering a call into a callseq_start <-
2258/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2259/// nodes.
2260SDValue
2261ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2262 SmallVectorImpl<SDValue> &InVals) const {
2263 SelectionDAG &DAG = CLI.DAG;
2264 SDLoc &dl = CLI.DL;
2265 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2266 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2267 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2268 SDValue Chain = CLI.Chain;
2269 SDValue Callee = CLI.Callee;
2270 bool &isTailCall = CLI.IsTailCall;
2271 CallingConv::ID CallConv = CLI.CallConv;
2272 bool doesNotRet = CLI.DoesNotReturn;
2273 bool isVarArg = CLI.IsVarArg;
2274 const CallBase *CB = CLI.CB;
2275
2276 MachineFunction &MF = DAG.getMachineFunction();
2277 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2278 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2279 MachineFunction::CallSiteInfo CSInfo;
2280 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2281 bool isThisReturn = false;
2282 bool isCmseNSCall = false;
2283 bool isSibCall = false;
2284 bool PreferIndirect = false;
2285 bool GuardWithBTI = false;
2286
2287 // Analyze operands of the call, assigning locations to each operand.
2288 SmallVector<CCValAssign, 16> ArgLocs;
2289 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2290 *DAG.getContext());
2291 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2292
2293 // Lower 'returns_twice' calls to a pseudo-instruction.
2294 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2295 !Subtarget->noBTIAtReturnTwice())
2296 GuardWithBTI = AFI->branchTargetEnforcement();
2297
2298 // Set type id for call site info.
2299 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2300 CSInfo = MachineFunction::CallSiteInfo(*CB);
2301
2302 // Determine whether this is a non-secure function call.
2303 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2304 isCmseNSCall = true;
2305
2306 // Disable tail calls if they're not supported.
2307 if (!Subtarget->supportsTailCall())
2308 isTailCall = false;
2309
2310 // For both the non-secure calls and the returns from a CMSE entry function,
2311 // the function needs to do some extra work after the call, or before the
2312 // return, respectively, thus it cannot end with a tail call
2313 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2314 isTailCall = false;
2315
2316 if (isa<GlobalAddressSDNode>(Callee)) {
2317 // If we're optimizing for minimum size and the function is called three or
2318 // more times in this block, we can improve codesize by calling indirectly
2319 // as BLXr has a 16-bit encoding.
2320 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2321 if (CLI.CB) {
2322 auto *BB = CLI.CB->getParent();
2323 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2324 count_if(GV->users(), [&BB](const User *U) {
2325 return isa<Instruction>(U) &&
2326 cast<Instruction>(U)->getParent() == BB;
2327 }) > 2;
2328 }
2329 }
2330 if (isTailCall) {
2331 // Check if it's really possible to do a tail call.
2332 isTailCall =
2333 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2334
2335 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2336 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2337 isSibCall = true;
2338
2339 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2340 // detected sibcalls.
2341 if (isTailCall)
2342 ++NumTailCalls;
2343 }
2344
2345 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2346 report_fatal_error("failed to perform tail call elimination on a call "
2347 "site marked musttail");
2348
2349 // Get a count of how many bytes are to be pushed on the stack.
2350 unsigned NumBytes = CCInfo.getStackSize();
2351
2352 // SPDiff is the byte offset of the call's argument area from the callee's.
2353 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2354 // by this amount for a tail call. In a sibling call it must be 0 because the
2355 // caller will deallocate the entire stack and the callee still expects its
2356 // arguments to begin at SP+0. Completely unused for non-tail calls.
2357 int SPDiff = 0;
2358
2359 if (isTailCall && !isSibCall) {
2360 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2361 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2362
2363 // Since callee will pop argument stack as a tail call, we must keep the
2364 // popped size 16-byte aligned.
2365 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2366 assert(StackAlign && "data layout string is missing stack alignment");
2367 NumBytes = alignTo(NumBytes, *StackAlign);
2368
2369 // SPDiff will be negative if this tail call requires more space than we
2370 // would automatically have in our incoming argument space. Positive if we
2371 // can actually shrink the stack.
2372 SPDiff = NumReusableBytes - NumBytes;
2373
2374 // If this call requires more stack than we have available from
2375 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2376 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2377 AFI->setArgRegsSaveSize(-SPDiff);
2378 }
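  // Worked example of the SPDiff computation above (sizes assumed for
  // exposition): if the incoming argument area provides NumReusableBytes = 8
  // and this tail call needs NumBytes = 24 of outgoing argument space,
  // NumBytes is first rounded up to the 16-byte stack alignment (32), giving
  // SPDiff = 8 - 32 = -24. The negative value means frame lowering must
  // reserve an extra 24 bytes, which is what setArgRegsSaveSize(-SPDiff)
  // records.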
2379
2380 if (isSibCall) {
2381 // For sibling tail calls, memory operands are available in our caller's stack.
2382 NumBytes = 0;
2383 } else {
2384 // Adjust the stack pointer for the new arguments...
2385 // These operations are automatically eliminated by the prolog/epilog pass
2386 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2387 }
2388
2390 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2391
2392 RegsToPassVector RegsToPass;
2393 SmallVector<SDValue, 8> MemOpChains;
2394
2395 // If we are doing a tail-call, any byval arguments will be written to stack
2396 // space which was used for incoming arguments. If any of the values being used
2397 // are incoming byval arguments to this function, then they might be
2398 // overwritten by the stores of the outgoing arguments. To avoid this, we
2399 // need to make a temporary copy of them in local stack space, then copy back
2400 // to the argument area.
2401 DenseMap<unsigned, SDValue> ByValTemporaries;
2402 SDValue ByValTempChain;
2403 if (isTailCall) {
2404 SmallVector<SDValue, 8> ByValCopyChains;
2405 for (const CCValAssign &VA : ArgLocs) {
2406 unsigned ArgIdx = VA.getValNo();
2407 SDValue Src = OutVals[ArgIdx];
2408 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2409
2410 if (!Flags.isByVal())
2411 continue;
2412
2413 SDValue Dst;
2414 MachinePointerInfo DstInfo;
2415 std::tie(Dst, DstInfo) =
2416 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2417 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2418
2419 if (Copy == NoCopy) {
2420 // If the argument is already at the correct offset on the stack
2421 // (because we are forwarding a byval argument from our caller), we
2422 // don't need any copying.
2423 continue;
2424 } else if (Copy == CopyOnce) {
2425 // If the argument is in our local stack frame, no other argument
2426 // preparation can clobber it, so we can copy it to the final location
2427 // later.
2428 ByValTemporaries[ArgIdx] = Src;
2429 } else {
2430 assert(Copy == CopyViaTemp && "unexpected enum value");
2431 // If we might be copying this argument from the outgoing argument
2432 // stack area, we need to copy via a temporary in the local stack
2433 // frame.
2434 int TempFrameIdx = MFI.CreateStackObject(
2435 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2436 SDValue Temp =
2437 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2438
2439 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2440 SDValue AlignNode =
2441 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2442
2443 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2444 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2445 ByValCopyChains.push_back(
2446 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2447 ByValTemporaries[ArgIdx] = Temp;
2448 }
2449 }
2450 if (!ByValCopyChains.empty())
2451 ByValTempChain =
2452 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2453 }
2454
2455 // During a tail call, stores to the argument area must happen after all of
2456 // the function's incoming arguments have been loaded because they may alias.
2457 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2458 // there's no point in doing so repeatedly so this tracks whether that's
2459 // happened yet.
2460 bool AfterFormalArgLoads = false;
2461
2462 // Walk the register/memloc assignments, inserting copies/loads. In the case
2463 // of tail call optimization, arguments are handled later.
2464 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2465 i != e;
2466 ++i, ++realArgIdx) {
2467 CCValAssign &VA = ArgLocs[i];
2468 SDValue Arg = OutVals[realArgIdx];
2469 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2470 bool isByVal = Flags.isByVal();
2471
2472 // Promote the value if needed.
2473 switch (VA.getLocInfo()) {
2474 default: llvm_unreachable("Unknown loc info!");
2475 case CCValAssign::Full: break;
2476 case CCValAssign::SExt:
2477 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2478 break;
2479 case CCValAssign::ZExt:
2480 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2481 break;
2482 case CCValAssign::AExt:
2483 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2484 break;
2485 case CCValAssign::BCvt:
2486 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2487 break;
2488 }
2489
2490 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2491 Chain = DAG.getStackArgumentTokenFactor(Chain);
2492 if (ByValTempChain)
2493 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2494 ByValTempChain);
2495 AfterFormalArgLoads = true;
2496 }
2497
2498 // f16 arguments have their size extended to 4 bytes and are passed as if they
2499 // had been copied to the LSBs of a 32-bit register.
2500 // For that, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
2501 if (VA.needsCustom() &&
2502 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2503 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2504 } else {
2505 // f16 arguments could have been extended prior to argument lowering.
2506 // Mask such arguments if this is a CMSE nonsecure call.
2507 auto ArgVT = Outs[realArgIdx].ArgVT;
2508 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2509 auto LocBits = VA.getLocVT().getSizeInBits();
2510 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2511 SDValue Mask =
2512 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2513 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2514 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2515 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2516 }
2517 }
2518
2519 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2520 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2521 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2522 DAG.getConstant(0, dl, MVT::i32));
2523 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2524 DAG.getConstant(1, dl, MVT::i32));
2525
2526 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2527 StackPtr, MemOpChains, isTailCall, SPDiff);
2528
2529 VA = ArgLocs[++i]; // skip ahead to next loc
2530 if (VA.isRegLoc()) {
2531 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2532 StackPtr, MemOpChains, isTailCall, SPDiff);
2533 } else {
2534 assert(VA.isMemLoc());
2535 SDValue DstAddr;
2536 MachinePointerInfo DstInfo;
2537 std::tie(DstAddr, DstInfo) =
2538 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2539 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2540 }
2541 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2542 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2543 StackPtr, MemOpChains, isTailCall, SPDiff);
2544 } else if (VA.isRegLoc()) {
2545 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2546 Outs[0].VT == MVT::i32) {
2547 assert(VA.getLocVT() == MVT::i32 &&
2548 "unexpected calling convention register assignment");
2549 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2550 "unexpected use of 'returned'");
2551 isThisReturn = true;
2552 }
2553 const TargetOptions &Options = DAG.getTarget().Options;
2554 if (Options.EmitCallSiteInfo)
2555 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2556 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2557 } else if (isByVal) {
2558 assert(VA.isMemLoc());
2559 unsigned offset = 0;
2560
2561 // True if this byval aggregate will be split between registers
2562 // and memory.
2563 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2564 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2565
2566 SDValue ByValSrc;
2567 bool NeedsStackCopy;
2568 if (auto It = ByValTemporaries.find(realArgIdx);
2569 It != ByValTemporaries.end()) {
2570 ByValSrc = It->second;
2571 NeedsStackCopy = true;
2572 } else {
2573 ByValSrc = Arg;
2574 NeedsStackCopy = !isTailCall;
2575 }
2576
2577 // If part of the argument is in registers, load them.
2578 if (CurByValIdx < ByValArgsCount) {
2579 unsigned RegBegin, RegEnd;
2580 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2581
2582 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2583 unsigned int i, j;
2584 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2585 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2586 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2587 SDValue Load =
2588 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2589 DAG.InferPtrAlign(AddArg));
2590 MemOpChains.push_back(Load.getValue(1));
2591 RegsToPass.push_back(std::make_pair(j, Load));
2592 }
2593
2594 // If the parameter size exceeds the register area, the "offset" value
2595 // helps us calculate the stack slot for the remaining part properly.
2596 offset = RegEnd - RegBegin;
2597
2598 CCInfo.nextInRegsParam();
2599 }
2600
2601 // If the memory part of the argument isn't already in the correct place
2602 // (which can happen with tail calls), copy it into the argument area.
2603 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2604 auto PtrVT = getPointerTy(DAG.getDataLayout());
2605 SDValue Dst;
2606 MachinePointerInfo DstInfo;
2607 std::tie(Dst, DstInfo) =
2608 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2609 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2610 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2611 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2612 MVT::i32);
2613 SDValue AlignNode =
2614 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2615
2616 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2617 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2618 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2619 Ops));
2620 }
2621 } else {
2622 assert(VA.isMemLoc());
2623 SDValue DstAddr;
2624 MachinePointerInfo DstInfo;
2625 std::tie(DstAddr, DstInfo) =
2626 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2627
2628 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2629 MemOpChains.push_back(Store);
2630 }
2631 }
2632
2633 if (!MemOpChains.empty())
2634 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2635
2636 // Build a sequence of copy-to-reg nodes chained together with token chain
2637 // and flag operands which copy the outgoing args into the appropriate regs.
2638 SDValue InGlue;
2639 for (const auto &[Reg, N] : RegsToPass) {
2640 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2641 InGlue = Chain.getValue(1);
2642 }
2643
2644 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2645 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2646 // node so that legalize doesn't hack it.
2647 bool isDirect = false;
2648
2649 const TargetMachine &TM = getTargetMachine();
2650 const GlobalValue *GVal = nullptr;
2651 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2652 GVal = G->getGlobal();
2653 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2654
2655 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2656 bool isLocalARMFunc = false;
2657 auto PtrVt = getPointerTy(DAG.getDataLayout());
2658
2659 if (Subtarget->genLongCalls()) {
2660 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2661 "long-calls codegen is not position independent!");
2662 // Handle a global address or an external symbol. If it's not one of
2663 // those, the target's already in a register, so we don't need to do
2664 // anything extra.
2665 if (isa<GlobalAddressSDNode>(Callee)) {
2666 if (Subtarget->genExecuteOnly()) {
2667 if (Subtarget->useMovt())
2668 ++NumMovwMovt;
2669 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2670 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2671 } else {
2672 // Create a constant pool entry for the callee address
2673 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2674 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2675 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2676
2677 // Get the address of the callee into a register
2678 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2679 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2680 Callee = DAG.getLoad(
2681 PtrVt, dl, DAG.getEntryNode(), Addr,
2683 }
2684 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2685 const char *Sym = S->getSymbol();
2686
2687 if (Subtarget->genExecuteOnly()) {
2688 if (Subtarget->useMovt())
2689 ++NumMovwMovt;
2690 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2691 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2692 } else {
2693 // Create a constant pool entry for the callee address
2694 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2695 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2696 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2697
2698 // Get the address of the callee into a register
2699 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2700 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2701 Callee = DAG.getLoad(
2702 PtrVt, dl, DAG.getEntryNode(), Addr,
2704 }
2705 }
2706 } else if (isa<GlobalAddressSDNode>(Callee)) {
2707 if (!PreferIndirect) {
2708 isDirect = true;
2709 bool isDef = GVal->isStrongDefinitionForLinker();
2710
2711 // ARM call to a local ARM function is predicable.
2712 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2713 // tBX takes a register source operand.
2714 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2715 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2716 Callee = DAG.getNode(
2717 ARMISD::WrapperPIC, dl, PtrVt,
2718 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2719 Callee = DAG.getLoad(
2720 PtrVt, dl, DAG.getEntryNode(), Callee,
2724 } else if (Subtarget->isTargetCOFF()) {
2725 assert(Subtarget->isTargetWindows() &&
2726 "Windows is the only supported COFF target");
2727 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2728 if (GVal->hasDLLImportStorageClass())
2729 TargetFlags = ARMII::MO_DLLIMPORT;
2730 else if (!TM.shouldAssumeDSOLocal(GVal))
2731 TargetFlags = ARMII::MO_COFFSTUB;
2732 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2733 TargetFlags);
2734 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2735 Callee =
2736 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2737 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2739 } else {
2740 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2741 }
2742 }
2743 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2744 isDirect = true;
2745 // tBX takes a register source operand.
2746 const char *Sym = S->getSymbol();
2747 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2748 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2749 ARMConstantPoolValue *CPV =
2751 ARMPCLabelIndex, 4);
2752 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2753 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2754 Callee = DAG.getLoad(
2755 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2757 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2758 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2759 } else {
2760 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2761 }
2762 }
2763
2764 if (isCmseNSCall) {
2765 assert(!isARMFunc && !isDirect &&
2766 "Cannot handle call to ARM function or direct call");
2767 if (NumBytes > 0) {
2768 DAG.getContext()->diagnose(
2769 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2770 "call to non-secure function would require "
2771 "passing arguments on stack",
2772 dl.getDebugLoc()));
2773 }
2774 if (isStructRet) {
2775 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2777 "call to non-secure function would return value through pointer",
2778 dl.getDebugLoc()));
2779 }
2780 }
2781
2782 // FIXME: handle tail calls differently.
2783 unsigned CallOpc;
2784 if (Subtarget->isThumb()) {
2785 if (GuardWithBTI)
2786 CallOpc = ARMISD::t2CALL_BTI;
2787 else if (isCmseNSCall)
2788 CallOpc = ARMISD::tSECALL;
2789 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2790 CallOpc = ARMISD::CALL_NOLINK;
2791 else
2792 CallOpc = ARMISD::CALL;
2793 } else {
2794 if (!isDirect && !Subtarget->hasV5TOps())
2795 CallOpc = ARMISD::CALL_NOLINK;
2796 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2797 // Emit regular call when code size is the priority
2798 !Subtarget->hasMinSize())
2799 // "mov lr, pc; b _foo" to avoid confusing the RSP
2800 CallOpc = ARMISD::CALL_NOLINK;
2801 else
2802 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2803 }
2804
2805 // We don't usually want to end the call-sequence here because we would tidy
2806 // the frame up *after* the call. However, in the ABI-changing tail-call case
2807 // we've carefully laid out the parameters so that when sp is reset they'll be
2808 // in the correct location.
2809 if (isTailCall && !isSibCall) {
2810 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2811 InGlue = Chain.getValue(1);
2812 }
2813
2814 std::vector<SDValue> Ops;
2815 Ops.push_back(Chain);
2816 Ops.push_back(Callee);
2817
2818 if (isTailCall) {
2819 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2820 }
2821
2822 // Add argument registers to the end of the list so that they are known live
2823 // into the call.
2824 for (const auto &[Reg, N] : RegsToPass)
2825 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2826
2827 // Add a register mask operand representing the call-preserved registers.
2828 const uint32_t *Mask;
2829 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2830 if (isThisReturn) {
2831 // For 'this' returns, use the R0-preserving mask if applicable
2832 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2833 if (!Mask) {
2834 // Set isThisReturn to false if the calling convention is not one that
2835 // allows 'returned' to be modeled in this way, so LowerCallResult does
2836 // not try to pass 'this' straight through
2837 isThisReturn = false;
2838 Mask = ARI->getCallPreservedMask(MF, CallConv);
2839 }
2840 } else
2841 Mask = ARI->getCallPreservedMask(MF, CallConv);
2842
2843 assert(Mask && "Missing call preserved mask for calling convention");
2844 Ops.push_back(DAG.getRegisterMask(Mask));
2845
2846 if (InGlue.getNode())
2847 Ops.push_back(InGlue);
2848
2849 if (isTailCall) {
2851 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2852 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2853 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2854 return Ret;
2855 }
2856
2857 // Returns a chain and a flag for retval copy to use.
2858 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2859 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2860 InGlue = Chain.getValue(1);
2861 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2862
2863 // If we're guaranteeing tail-calls will be honoured, the callee must
2864 // pop its own argument stack on return. But this call is *not* a tail call so
2865 // we need to undo that after it returns to restore the status quo.
2866 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2867 uint64_t CalleePopBytes =
2868 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2869
2870 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2871 if (!Ins.empty())
2872 InGlue = Chain.getValue(1);
2873
2874 // Handle result values, copying them out of physregs into vregs that we
2875 // return.
2876 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2877 InVals, isThisReturn,
2878 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2879}
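// Illustrative shape (operand order as pushed into Ops above) of the final
// call node: a plain direct call becomes roughly
//
//   ARMISD::CALL ch, callee, Reg0, Reg1, ..., RegMask [, glue]
//
// while a tail call becomes ARMISD::TC_RETURN with an extra SPDiff
// target-constant operand inserted right after the callee.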
2880
2881/// HandleByVal - Every parameter *after* a byval parameter is passed
2882/// on the stack. Remember the next parameter register to allocate,
2883/// and then confiscate the rest of the parameter registers to ensure
2884/// this.
2885void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2886 Align Alignment) const {
2887 // Byval (as with any stack) slots are always at least 4 byte aligned.
2888 Alignment = std::max(Alignment, Align(4));
2889
2890 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2891 if (!Reg)
2892 return;
2893
2894 unsigned AlignInRegs = Alignment.value() / 4;
2895 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2896 for (unsigned i = 0; i < Waste; ++i)
2897 Reg = State->AllocateReg(GPRArgRegs);
2898
2899 if (!Reg)
2900 return;
2901
2902 unsigned Excess = 4 * (ARM::R4 - Reg);
2903
2904 // Special case when NSAA != SP and the parameter size is greater than the
2905 // size of all remaining GPR regs. In that case we can't split the parameter,
2906 // we must send it to the stack. We also must set NCRN to R4, so waste all
2907 // remaining registers.
2908 const unsigned NSAAOffset = State->getStackSize();
2909 if (NSAAOffset != 0 && Size > Excess) {
2910 while (State->AllocateReg(GPRArgRegs))
2911 ;
2912 return;
2913 }
2914
2915 // The first register for the byval parameter is the first register that
2916 // wasn't allocated before this method call, so it would be "reg".
2917 // If the parameter is small enough to be saved in the range [reg, r4), then
2918 // the end (one past the last) register would be reg + param-size-in-regs;
2919 // otherwise the parameter is split between registers and the stack, and
2920 // the end register is r4 in this case.
2921 unsigned ByValRegBegin = Reg;
2922 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2923 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2924 // Note, the first register has already been allocated at the beginning of
2925 // this function, so allocate the remaining registers we need.
2926 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2927 State->AllocateReg(GPRArgRegs);
2928 // A byval parameter that is split between registers and memory needs its
2929 // size truncated here.
2930 // In the case where the entire structure fits in registers, we set the
2931 // size in memory to zero.
2932 Size = std::max<int>(Size - Excess, 0);
2933}
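// Worked example (sizes assumed for exposition, and assuming no stack
// arguments have been allocated yet, i.e. NSAAOffset == 0): for a 20-byte
// byval with 8-byte alignment when r1 is the next free register,
// AlignInRegs = 2, so r1 is wasted and the byval begins at r2. Excess is
// then 4 * (r4 - r2) = 8, so r2 and r3 carry the first 8 bytes
// (ByValRegBegin = r2, ByValRegEnd = r4) and Size is reduced to 12, the
// number of bytes that remain in memory.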
2934
2935/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2936/// for tail call optimization. Targets which want to do tail call
2937/// optimization should implement this function. Note that this function also
2938/// processes musttail calls, so when this function returns false on a valid
2939/// musttail call, a fatal backend error occurs.
2940bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2942 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2943 CallingConv::ID CalleeCC = CLI.CallConv;
2944 SDValue Callee = CLI.Callee;
2945 bool isVarArg = CLI.IsVarArg;
2946 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2947 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2948 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2949 const SelectionDAG &DAG = CLI.DAG;
2950 MachineFunction &MF = DAG.getMachineFunction();
2951 const Function &CallerF = MF.getFunction();
2952 CallingConv::ID CallerCC = CallerF.getCallingConv();
2953
2954 assert(Subtarget->supportsTailCall());
2955
2956 // Indirect tail-calls require a register to hold the target address. That
2957 // register must be:
2958 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2959 // * Not callee-saved, so must be one of r0-r3 or r12.
2960 // * Not used to hold an argument to the tail-called function, which might be
2961 // in r0-r3.
2962 // * Not used to hold the return address authentication code, which is in r12
2963 // if enabled.
2964 // Sometimes, no register matches all of these conditions, so we can't do a
2965 // tail-call.
2966 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2967 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2968 ARM::R3};
2969 if (!(Subtarget->isThumb1Only() ||
2970 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2971 AddressRegisters.insert(ARM::R12);
2972 for (const CCValAssign &AL : ArgLocs)
2973 if (AL.isRegLoc())
2974 AddressRegisters.erase(AL.getLocReg());
2975 if (AddressRegisters.empty()) {
2976 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2977 return false;
2978 }
2979 }
2980
2981 // Look for obvious safe cases to perform tail call optimization that do not
2982 // require ABI changes. This is what gcc calls sibcall.
2983
2984 // Exception-handling functions need a special set of instructions to indicate
2985 // a return to the hardware. Tail-calling another function would probably
2986 // break this.
2987 if (CallerF.hasFnAttribute("interrupt")) {
2988 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2989 return false;
2990 }
2991
2992 if (canGuaranteeTCO(CalleeCC,
2993 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2994 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2995 << " (guaranteed tail-call CC)\n");
2996 return CalleeCC == CallerCC;
2997 }
2998
2999 // Also avoid sibcall optimization if either caller or callee uses struct
3000 // return semantics.
3001 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3002 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3003 if (isCalleeStructRet != isCallerStructRet) {
3004 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3005 return false;
3006 }
3007
3008 // Externally-defined functions with weak linkage should not be
3009 // tail-called on ARM when the OS does not support dynamic
3010 // pre-emption of symbols, as the AAELF spec requires normal calls
3011 // to undefined weak functions to be replaced with a NOP or jump to the
3012 // next instruction. The behaviour of branch instructions in this
3013 // situation (as used for tail calls) is implementation-defined, so we
3014 // cannot rely on the linker replacing the tail call with a return.
3015 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3016 const GlobalValue *GV = G->getGlobal();
3017 const Triple &TT = getTargetMachine().getTargetTriple();
3018 if (GV->hasExternalWeakLinkage() &&
3019 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3020 TT.isOSBinFormatMachO())) {
3021 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3022 return false;
3023 }
3024 }
3025
3026 // Check that the call results are passed in the same way.
3027 LLVMContext &C = *DAG.getContext();
3029 getEffectiveCallingConv(CalleeCC, isVarArg),
3030 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3031 CCAssignFnForReturn(CalleeCC, isVarArg),
3032 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3033 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3034 return false;
3035 }
3036 // The callee has to preserve all registers the caller needs to preserve.
3037 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3038 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3039 if (CalleeCC != CallerCC) {
3040 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3041 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3042 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3043 return false;
3044 }
3045 }
3046
3047 // If Caller's vararg argument has been split between registers and stack, do
3048 // not perform tail call, since part of the argument is in caller's local
3049 // frame.
3050 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3051 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3052 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3053 return false;
3054 }
3055
3056 // If the callee takes no arguments then go on to check the results of the
3057 // call.
3058 const MachineRegisterInfo &MRI = MF.getRegInfo();
3059 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3060 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3061 return false;
3062 }
3063
3064 // If the stack arguments for this call do not fit into our own save area then
3065 // the call cannot be made tail.
3066 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3067 return false;
3068
3069 LLVM_DEBUG(dbgs() << "true\n");
3070 return true;
3071}
3072
3073bool
3074ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3075 MachineFunction &MF, bool isVarArg,
3077 LLVMContext &Context, const Type *RetTy) const {
3078 SmallVector<CCValAssign, 16> RVLocs;
3079 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3080 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3081}
3082
3083static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3084 const SDLoc &DL, SelectionDAG &DAG) {
3085 const MachineFunction &MF = DAG.getMachineFunction();
3086 const Function &F = MF.getFunction();
3087
3088 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3089
3090 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3091 // version of the "preferred return address". These offsets affect the return
3092 // instruction if this is a return from PL1 without hypervisor extensions.
3093 // IRQ/FIQ: +4 "subs pc, lr, #4"
3094 // SWI: 0 "subs pc, lr, #0"
3095 // ABORT: +4 "subs pc, lr, #4"
3096 // UNDEF: +4/+2 "subs pc, lr, #0"
3097 // UNDEF varies depending on where the exception came from ARM or Thumb
3098 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3099
3100 int64_t LROffset;
3101 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3102 IntKind == "ABORT")
3103 LROffset = 4;
3104 else if (IntKind == "SWI" || IntKind == "UNDEF")
3105 LROffset = 0;
3106 else
3107 report_fatal_error("Unsupported interrupt attribute. If present, value "
3108 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3109
3110 RetOps.insert(RetOps.begin() + 1,
3111 DAG.getConstant(LROffset, DL, MVT::i32, false));
3112
3113 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3114}
3115
3116SDValue
3117ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3118 bool isVarArg,
3120 const SmallVectorImpl<SDValue> &OutVals,
3121 const SDLoc &dl, SelectionDAG &DAG) const {
3122 // CCValAssign - represent the assignment of the return value to a location.
3123 SmallVector<CCValAssign, 16> RVLocs;
3124
3125 // CCState - Info about the registers and stack slots.
3126 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3127 *DAG.getContext());
3128
3129 // Analyze outgoing return values.
3130 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3131
3132 SDValue Glue;
3133 SmallVector<SDValue, 4> RetOps;
3134 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3135 bool isLittleEndian = Subtarget->isLittle();
3136
3137 MachineFunction &MF = DAG.getMachineFunction();
3138 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3139 AFI->setReturnRegsCount(RVLocs.size());
3140
3141 // Report error if cmse entry function returns structure through first ptr arg.
3142 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3143 // Note: using an empty SDLoc(), as the first line of the function is a
3144 // better place to report than the last line.
3145 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3147 "secure entry function would return value through pointer",
3148 SDLoc().getDebugLoc()));
3149 }
3150
3151 // Copy the result values into the output registers.
3152 for (unsigned i = 0, realRVLocIdx = 0;
3153 i != RVLocs.size();
3154 ++i, ++realRVLocIdx) {
3155 CCValAssign &VA = RVLocs[i];
3156 assert(VA.isRegLoc() && "Can only return in registers!");
3157
3158 SDValue Arg = OutVals[realRVLocIdx];
3159 bool ReturnF16 = false;
3160
3161 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3162 // Half-precision return values can be returned like this:
3163 //
3164 // t11: f16 = fadd ...
3165 // t12: i16 = bitcast t11
3166 // t13: i32 = zero_extend t12
3167 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3168 //
3169 // to avoid code generation for bitcasts, we simply set Arg to the node
3170 // that produces the f16 value, t11 in this case.
3171 //
3172 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3173 SDValue ZE = Arg.getOperand(0);
3174 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3175 SDValue BC = ZE.getOperand(0);
3176 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3177 Arg = BC.getOperand(0);
3178 ReturnF16 = true;
3179 }
3180 }
3181 }
3182 }
3183
3184 switch (VA.getLocInfo()) {
3185 default: llvm_unreachable("Unknown loc info!");
3186 case CCValAssign::Full: break;
3187 case CCValAssign::BCvt:
3188 if (!ReturnF16)
3189 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3190 break;
3191 }
3192
3193 // Mask f16 arguments if this is a CMSE nonsecure entry.
3194 auto RetVT = Outs[realRVLocIdx].ArgVT;
3195 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3196 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3197 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3198 } else {
3199 auto LocBits = VA.getLocVT().getSizeInBits();
3200 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3201 SDValue Mask =
3202 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3203 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3204 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3205 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3206 }
3207 }
3208
3209 if (VA.needsCustom() &&
3210 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3211 if (VA.getLocVT() == MVT::v2f64) {
3212 // Extract the first half and return it in two registers.
3213 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3214 DAG.getConstant(0, dl, MVT::i32));
3215 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3216 DAG.getVTList(MVT::i32, MVT::i32), Half);
3217
3218 Chain =
3219 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3220 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3221 Glue = Chain.getValue(1);
3222 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3223 VA = RVLocs[++i]; // skip ahead to next loc
3224 Chain =
3225 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3226 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3227 Glue = Chain.getValue(1);
3228 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3229 VA = RVLocs[++i]; // skip ahead to next loc
3230
3231 // Extract the 2nd half and fall through to handle it as an f64 value.
3232 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3233 DAG.getConstant(1, dl, MVT::i32));
3234 }
3235 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3236 // available.
3237 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3238 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3239 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3240 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3241 Glue = Chain.getValue(1);
3242 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3243 VA = RVLocs[++i]; // skip ahead to next loc
3244 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3245 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3246 } else
3247 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3248
3249 // Guarantee that all emitted copies are
3250 // stuck together, avoiding something bad.
3251 Glue = Chain.getValue(1);
3252 RetOps.push_back(DAG.getRegister(
3253 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3254 }
3255 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3256 const MCPhysReg *I =
3257 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3258 if (I) {
3259 for (; *I; ++I) {
3260 if (ARM::GPRRegClass.contains(*I))
3261 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3262 else if (ARM::DPRRegClass.contains(*I))
3264 else
3265 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3266 }
3267 }
3268
3269 // Update chain and glue.
3270 RetOps[0] = Chain;
3271 if (Glue.getNode())
3272 RetOps.push_back(Glue);
3273
3274 // CPUs which aren't M-class use a special sequence to return from
3275 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3276 // though we use "subs pc, lr, #N").
3277 //
3278 // M-class CPUs actually use a normal return sequence with a special
3279 // (hardware-provided) value in LR, so the normal code path works.
3280 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3281 !Subtarget->isMClass()) {
3282 if (Subtarget->isThumb1Only())
3283 report_fatal_error("interrupt attribute is not supported in Thumb1");
3284 return LowerInterruptReturn(RetOps, dl, DAG);
3285 }
3286
3289 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3290}
3291
3292bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3293 if (N->getNumValues() != 1)
3294 return false;
3295 if (!N->hasNUsesOfValue(1, 0))
3296 return false;
3297
3298 SDValue TCChain = Chain;
3299 SDNode *Copy = *N->user_begin();
3300 if (Copy->getOpcode() == ISD::CopyToReg) {
3301 // If the copy has a glue operand, we conservatively assume it isn't safe to
3302 // perform a tail call.
3303 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3304 return false;
3305 TCChain = Copy->getOperand(0);
3306 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3307 SDNode *VMov = Copy;
3308 // f64 returned in a pair of GPRs.
3309 SmallPtrSet<SDNode*, 2> Copies;
3310 for (SDNode *U : VMov->users()) {
3311 if (U->getOpcode() != ISD::CopyToReg)
3312 return false;
3313 Copies.insert(U);
3314 }
3315 if (Copies.size() > 2)
3316 return false;
3317
3318 for (SDNode *U : VMov->users()) {
3319 SDValue UseChain = U->getOperand(0);
3320 if (Copies.count(UseChain.getNode()))
3321 // Second CopyToReg
3322 Copy = U;
3323 else {
3324 // We are at the top of this chain.
3325 // If the copy has a glue operand, we conservatively assume it
3326 // isn't safe to perform a tail call.
3327 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3328 return false;
3329 // First CopyToReg
3330 TCChain = UseChain;
3331 }
3332 }
3333 } else if (Copy->getOpcode() == ISD::BITCAST) {
3334 // f32 returned in a single GPR.
3335 if (!Copy->hasOneUse())
3336 return false;
3337 Copy = *Copy->user_begin();
3338 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3339 return false;
3340 // If the copy has a glue operand, we conservatively assume it isn't safe to
3341 // perform a tail call.
3342 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3343 return false;
3344 TCChain = Copy->getOperand(0);
3345 } else {
3346 return false;
3347 }
3348
3349 bool HasRet = false;
3350 for (const SDNode *U : Copy->users()) {
3351 if (U->getOpcode() != ARMISD::RET_GLUE &&
3352 U->getOpcode() != ARMISD::INTRET_GLUE)
3353 return false;
3354 HasRet = true;
3355 }
3356
3357 if (!HasRet)
3358 return false;
3359
3360 Chain = TCChain;
3361 return true;
3362}
3363
3364bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3365 if (!Subtarget->supportsTailCall())
3366 return false;
3367
3368 if (!CI->isTailCall())
3369 return false;
3370
3371 return true;
3372}
3373
3374 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3375 // values first, and pass the low and high parts through.
3376static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3377 SDLoc DL(Op);
3378 SDValue WriteValue = Op->getOperand(2);
3379
3380 // This function is only supposed to be called for i64 type argument.
3381 assert(WriteValue.getValueType() == MVT::i64
3382 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3383
3384 SDValue Lo, Hi;
3385 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3386 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3387 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3388}
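// Illustrative result of the lowering above: a write of an i64 value V to a
// system register becomes
//
//   WRITE_REGISTER ch, regname, lo(V), hi(V)
//
// where lo(V) and hi(V) are the two i32 halves produced by DAG.SplitScalar.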
3389
3390// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3391// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3392// one of the above mentioned nodes. It has to be wrapped because otherwise
3393// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3394// be used to form addressing mode. These wrapped nodes will be selected
3395// into MOVi.
3396SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3397 SelectionDAG &DAG) const {
3398 EVT PtrVT = Op.getValueType();
3399 // FIXME there is no actual debug info here
3400 SDLoc dl(Op);
3401 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3402 SDValue Res;
3403
3404 // When generating execute-only code, constant pools must be promoted to the
3405 // global data section. It's a bit ugly that we can't share them across basic
3406 // blocks, but this way we guarantee that execute-only behaves correctly with
3407 // position-independent addressing modes.
3408 if (Subtarget->genExecuteOnly()) {
3409 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3410 auto *T = CP->getType();
3411 auto C = const_cast<Constant*>(CP->getConstVal());
3412 auto M = DAG.getMachineFunction().getFunction().getParent();
3413 auto GV = new GlobalVariable(
3414 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3415 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3416 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3417 Twine(AFI->createPICLabelUId())
3418 );
3420 dl, PtrVT);
3421 return LowerGlobalAddress(GA, DAG);
3422 }
3423
3424 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3425 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3426 Align CPAlign = CP->getAlign();
3427 if (Subtarget->isThumb1Only())
3428 CPAlign = std::max(CPAlign, Align(4));
3429 if (CP->isMachineConstantPoolEntry())
3430 Res =
3431 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3432 else
3433 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3434 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3435}
3436
3438 // If we don't have a 32-bit pc-relative branch instruction then the jump
3439 // table consists of block addresses. Usually this is inline, but for
3440 // execute-only it must be placed out-of-line.
3441 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3444}
3445
3446SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3447 SelectionDAG &DAG) const {
3450 unsigned ARMPCLabelIndex = 0;
3451 SDLoc DL(Op);
3452 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3453 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3454 SDValue CPAddr;
3455 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3456 if (!IsPositionIndependent) {
3457 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3458 } else {
3459 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3460 ARMPCLabelIndex = AFI->createPICLabelUId();
3462 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3463 ARMCP::CPBlockAddress, PCAdj);
3464 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3465 }
3466 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3467 SDValue Result = DAG.getLoad(
3468 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3470 if (!IsPositionIndependent)
3471 return Result;
3472 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3473 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3474}
3475
3476/// Convert a TLS address reference into the correct sequence of loads
3477/// and calls to compute the variable's address for Darwin, and return an
3478/// SDValue containing the final node.
3479
3480/// Darwin only has one TLS scheme which must be capable of dealing with the
3481/// fully general situation, in the worst case. This means:
3482/// + "extern __thread" declaration.
3483/// + Defined in a possibly unknown dynamic library.
3484///
3485/// The general system is that each __thread variable has a [3 x i32] descriptor
3486/// which contains information used by the runtime to calculate the address. The
3487/// only part of this the compiler needs to know about is the first word, which
3488/// contains a function pointer that must be called with the address of the
3489/// entire descriptor in "r0".
3490///
3491/// Since this descriptor may be in a different unit, in general access must
3492/// proceed along the usual ARM rules. A common sequence to produce is:
3493///
3494/// movw rT1, :lower16:_var$non_lazy_ptr
3495/// movt rT1, :upper16:_var$non_lazy_ptr
3496/// ldr r0, [rT1]
3497/// ldr rT2, [r0]
3498/// blx rT2
3499/// [...address now in r0...]
3500SDValue
3501ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3502 SelectionDAG &DAG) const {
3503 assert(Subtarget->isTargetDarwin() &&
3504 "This function expects a Darwin target");
3505 SDLoc DL(Op);
3506
3507 // The first step is to get the address of the actual global symbol. This is where
3508 // the TLS descriptor lives.
3509 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3510
3511 // The first entry in the descriptor is a function pointer that we must call
3512 // to obtain the address of the variable.
3513 SDValue Chain = DAG.getEntryNode();
3514 SDValue FuncTLVGet = DAG.getLoad(
3515 MVT::i32, DL, Chain, DescAddr,
3519 Chain = FuncTLVGet.getValue(1);
3520
3521 MachineFunction &F = DAG.getMachineFunction();
3522 MachineFrameInfo &MFI = F.getFrameInfo();
3523 MFI.setAdjustsStack(true);
3524
3525 // TLS calls preserve all registers except those that absolutely must be
3526 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3527 // silly).
3528 auto TRI =
3530 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3531 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3532
3533 // Finally, we can make the call. This is just a degenerate version of a
3534 // normal ARM call node: r0 takes the address of the descriptor, and
3535 // returns the address of the variable in this thread.
3536 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3537 Chain =
3538 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3539 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3540 DAG.getRegisterMask(Mask), Chain.getValue(1));
3541 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3542}
3543
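// Windows on ARM keeps per-module TLS data in blocks reachable from the TEB.
// A rough sketch of the address computation built by the lowering below
// (illustrative only; the exact nodes are the ones emitted in this function):
//
//   TEB      = MRC p15, 0, c13, c0, 2        // thread environment block
//   TLSArray = *(TEB + 0x2c)                 // ThreadLocalStoragePointer
//   TLS      = *(TLSArray + _tls_index * 4)  // this module's TLS block
//   &var     = TLS + SECREL(var)             // SECREL offset is loaded from a
//                                            // constant-pool entry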
3544SDValue
3545ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3546 SelectionDAG &DAG) const {
3547 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3548
3549 SDValue Chain = DAG.getEntryNode();
3550 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3551 SDLoc DL(Op);
3552
3553 // Load the current TEB (thread environment block)
3554 SDValue Ops[] = {Chain,
3555 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3556 DAG.getTargetConstant(15, DL, MVT::i32),
3557 DAG.getTargetConstant(0, DL, MVT::i32),
3558 DAG.getTargetConstant(13, DL, MVT::i32),
3559 DAG.getTargetConstant(0, DL, MVT::i32),
3560 DAG.getTargetConstant(2, DL, MVT::i32)};
3561 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3562 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3563
3564 SDValue TEB = CurrentTEB.getValue(0);
3565 Chain = CurrentTEB.getValue(1);
3566
3567 // Load the ThreadLocalStoragePointer from the TEB
3568 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3569 SDValue TLSArray =
3570 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3571 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3572
3573 // The pointer to the thread's TLS data area lives in the TLS array at the
3574 // slot given by the TLS index (i.e. at offset TLS index * 4).
3575
3576 // Load the TLS index from the C runtime
3577 SDValue TLSIndex =
3578 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3579 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3580 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3581
3582 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3583 DAG.getConstant(2, DL, MVT::i32));
3584 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3585 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3586 MachinePointerInfo());
3587
3588 // Get the offset of the start of the .tls section (section base)
3589 const auto *GA = cast<GlobalAddressSDNode>(Op);
3590 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3591 SDValue Offset = DAG.getLoad(
3592 PtrVT, DL, Chain,
3593 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3594 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3596
3597 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3598}
3599
3600// Lower ISD::GlobalTLSAddress using the "general dynamic" model
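// A rough sketch of the sequence built here (illustrative): a TLSGD
// constant-pool entry is loaded, adjusted with a PIC label (PIC_ADD), and the
// resulting descriptor address is passed as the single i32 argument of a call
// to __tls_get_addr, which returns the variable's address.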
3601SDValue
3602ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3603 SelectionDAG &DAG) const {
3604 SDLoc dl(GA);
3605 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3606 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3607 MachineFunction &MF = DAG.getMachineFunction();
3608 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3609 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3610 ARMConstantPoolValue *CPV =
3611 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3612 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3613 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3614 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3615 Argument = DAG.getLoad(
3616 PtrVT, dl, DAG.getEntryNode(), Argument,
3618 SDValue Chain = Argument.getValue(1);
3619
3620 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3621 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3622
3623 // call __tls_get_addr.
3625 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3626
3627 // FIXME: is there useful debug info available here?
3628 TargetLowering::CallLoweringInfo CLI(DAG);
3629 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3631 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3632
3633 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3634 return CallResult.first;
3635}
3636
3637// Lower ISD::GlobalTLSAddress using the "initial exec" or
3638// "local exec" model.
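// A rough sketch of the two cases handled here (illustrative):
//   Initial exec: the variable's TP-relative offset sits behind a GOT entry,
//     so we load the constant-pool word, apply a PIC_ADD, load the GOT slot,
//     and add the result to the thread pointer.
//   Local exec:   the TP-relative offset is a link-time constant, so it is
//     loaded directly from a constant-pool entry and added to the thread
//     pointer.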
3639SDValue
3640ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3641 SelectionDAG &DAG,
3642 TLSModel::Model model) const {
3643 const GlobalValue *GV = GA->getGlobal();
3644 SDLoc dl(GA);
3646 SDValue Chain = DAG.getEntryNode();
3647 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3648 // Get the Thread Pointer
3650
3651 if (model == TLSModel::InitialExec) {
3652 MachineFunction &MF = DAG.getMachineFunction();
3653 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3654 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3655 // Initial exec model.
3656 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3657 ARMConstantPoolValue *CPV =
3658 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3660 true);
3661 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3662 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3663 Offset = DAG.getLoad(
3664 PtrVT, dl, Chain, Offset,
3666 Chain = Offset.getValue(1);
3667
3668 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3669 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3670
3671 Offset = DAG.getLoad(
3672 PtrVT, dl, Chain, Offset,
3674 } else {
3675 // local exec model
3676 assert(model == TLSModel::LocalExec);
3677 ARMConstantPoolValue *CPV =
3679 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3680 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3681 Offset = DAG.getLoad(
3682 PtrVT, dl, Chain, Offset,
3684 }
3685
3686 // The address of the thread-local variable is the sum of the thread
3687 // pointer and the offset of the variable.
3688 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3689}
3690
3691SDValue
3692ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3693 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3694 if (DAG.getTarget().useEmulatedTLS())
3695 return LowerToTLSEmulatedModel(GA, DAG);
3696
3697 if (Subtarget->isTargetDarwin())
3698 return LowerGlobalTLSAddressDarwin(Op, DAG);
3699
3700 if (Subtarget->isTargetWindows())
3701 return LowerGlobalTLSAddressWindows(Op, DAG);
3702
3703 // TODO: implement the "local dynamic" model
3704 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3706
3707 switch (model) {
3710 return LowerToTLSGeneralDynamicModel(GA, DAG);
3713 return LowerToTLSExecModels(GA, DAG, model);
3714 }
3715 llvm_unreachable("bogus TLS model");
3716}
3717
3718/// Return true if all users of V are within function F, looking through
3719/// ConstantExprs.
3720static bool allUsersAreInFunction(const Value *V, const Function *F) {
3721 SmallVector<const User*,4> Worklist(V->users());
3722 while (!Worklist.empty()) {
3723 auto *U = Worklist.pop_back_val();
3724 if (isa<ConstantExpr>(U)) {
3725 append_range(Worklist, U->users());
3726 continue;
3727 }
3728
3729 auto *I = dyn_cast<Instruction>(U);
3730 if (!I || I->getParent()->getParent() != F)
3731 return false;
3732 }
3733 return true;
3734}
3735
3737 const GlobalValue *GV, SelectionDAG &DAG,
3738 EVT PtrVT, const SDLoc &dl) {
3739 // If we're creating a pool entry for a constant global with unnamed address,
3740 // and the global is small enough, we can emit it inline into the constant pool
3741 // to save ourselves an indirection.
3742 //
3743 // This is a win if the constant is only used in one function (so it doesn't
3744 // need to be duplicated) or duplicating the constant wouldn't increase code
3745 // size (implying the constant is no larger than 4 bytes).
3746 const Function &F = DAG.getMachineFunction().getFunction();
3747
3748 // We rely on this decision to inline being idempotent and unrelated to the
3749 // use-site. We know that if we inline a variable at one use site, we'll
3750 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3751 // doesn't know about this optimization, so bail out if it's enabled; else
3752 // we could decide to inline here (and thus never emit the GV) while the
3753 // fast-isel generated code still requires the GV.
3756 return SDValue();
3757
3758 auto *GVar = dyn_cast<GlobalVariable>(GV);
3759 if (!GVar || !GVar->hasInitializer() ||
3760 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3761 !GVar->hasLocalLinkage())
3762 return SDValue();
3763
3764 // If we inline a value that contains relocations, we move the relocations
3765 // from .data to .text. This is not allowed in position-independent code.
3766 auto *Init = GVar->getInitializer();
3767 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3768 Init->needsDynamicRelocation())
3769 return SDValue();
3770
3771 // The constant islands pass can only really deal with alignment requests
3772 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3773 // any type requiring alignment greater than 4 bytes. We also can only
3774 // promote constants that are multiples of 4 bytes in size or are paddable
3775 // to a multiple of 4. Currently we only try to pad constants that are
3776 // strings, for simplicity.
3777 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3778 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3779 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3780 unsigned RequiredPadding = 4 - (Size % 4);
3781 bool PaddingPossible =
3782 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3783 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3784 Size == 0)
3785 return SDValue();
3786
3787 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
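  // Worked example (illustrative): a 6-byte string initializer has Size == 6,
  // so RequiredPadding == 4 - (6 % 4) == 2 and PaddedSize == 8; an initializer
  // that is already a multiple of 4 (say Size == 8) gets RequiredPadding == 4,
  // which is treated as "no padding needed", and PaddedSize stays equal to
  // Size.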
3789 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3790
3791 // We can't bloat the constant pool too much, else the ConstantIslands pass
3792 // may fail to converge. If we haven't promoted this global yet (it may have
3793 // multiple uses), and promoting it would increase the constant pool size (Size
3794 // > 4), ensure we have space to do so up to the maximum total allowed.
3795 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3796 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3798 return SDValue();
3799
3800 // This is only valid if all users are in a single function; we can't clone
3801 // the constant in general. The LLVM IR unnamed_addr allows merging
3802 // constants, but not cloning them.
3803 //
3804 // We could potentially allow cloning if we could prove all uses of the
3805 // constant in the current function don't care about the address, like
3806 // printf format strings. But that isn't implemented for now.
3807 if (!allUsersAreInFunction(GVar, &F))
3808 return SDValue();
3809
3810 // We're going to inline this global. Pad it out if needed.
3811 if (RequiredPadding != 4) {
3812 StringRef S = CDAInit->getAsString();
3813
3815 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3816 while (RequiredPadding--)
3817 V.push_back(0);
3819 }
3820
3821 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3822 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3823 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3826 PaddedSize - 4);
3827 }
3828 ++NumConstpoolPromoted;
3829 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3830}
3831
3833 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3834 if (!(GV = GA->getAliaseeObject()))
3835 return false;
3836 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3837 return V->isConstant();
3838 return isa<Function>(GV);
3839}
3840
3841SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3842 SelectionDAG &DAG) const {
3843 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3844 default: llvm_unreachable("unknown object format");
3845 case Triple::COFF:
3846 return LowerGlobalAddressWindows(Op, DAG);
3847 case Triple::ELF:
3848 return LowerGlobalAddressELF(Op, DAG);
3849 case Triple::MachO:
3850 return LowerGlobalAddressDarwin(Op, DAG);
3851 }
3852}
3853
3854SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3855 SelectionDAG &DAG) const {
3856 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3857 SDLoc dl(Op);
3858 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3859 bool IsRO = isReadOnly(GV);
3860
3861 // Call promoteToConstantPool only if we are not generating an execute-only (XO) text section.
3862 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3863 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3864 return V;
3865
3866 if (isPositionIndependent()) {
3868 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3869 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3870 if (!GV->isDSOLocal())
3871 Result =
3872 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3874 return Result;
3875 } else if (Subtarget->isROPI() && IsRO) {
3876 // PC-relative.
3877 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3878 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3879 return Result;
3880 } else if (Subtarget->isRWPI() && !IsRO) {
3881 // SB-relative.
3882 SDValue RelAddr;
3883 if (Subtarget->useMovt()) {
3884 ++NumMovwMovt;
3885 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3886 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3887 } else { // use literal pool for address constant
3888 ARMConstantPoolValue *CPV =
3890 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3891 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3892 RelAddr = DAG.getLoad(
3893 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3895 }
3896 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3897 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3898 return Result;
3899 }
3900
3901 // If we have T2 ops, we can materialize the address directly via movt/movw
3902 // pair. This is always cheaper. If we need to generate execute-only code and we
3903 // only have Thumb1 available, we can't use a constant pool and are forced to
3904 // use immediate relocations.
3905 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3906 if (Subtarget->useMovt())
3907 ++NumMovwMovt;
3908 // FIXME: Once remat is capable of dealing with instructions with register
3909 // operands, expand this into two nodes.
3910 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3911 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3912 } else {
3913 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3914 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3915 return DAG.getLoad(
3916 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3918 }
3919}
3920
3921SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3922 SelectionDAG &DAG) const {
3923 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3924 "ROPI/RWPI not currently supported for Darwin");
3925 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3926 SDLoc dl(Op);
3927 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3928
3929 if (Subtarget->useMovt())
3930 ++NumMovwMovt;
3931
3932 // FIXME: Once remat is capable of dealing with instructions with register
3933 // operands, expand this into multiple nodes
3934 unsigned Wrapper =
3936
3937 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3938 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3939
3940 if (Subtarget->isGVIndirectSymbol(GV))
3941 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3943 return Result;
3944}
3945
3946SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3947 SelectionDAG &DAG) const {
3948 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3949 assert(Subtarget->useMovt() &&
3950 "Windows on ARM expects to use movw/movt");
3951 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3952 "ROPI/RWPI not currently supported for Windows");
3953
3954 const TargetMachine &TM = getTargetMachine();
3955 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3956 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3957 if (GV->hasDLLImportStorageClass())
3958 TargetFlags = ARMII::MO_DLLIMPORT;
3959 else if (!TM.shouldAssumeDSOLocal(GV))
3960 TargetFlags = ARMII::MO_COFFSTUB;
3961 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3963 SDLoc DL(Op);
3964
3965 ++NumMovwMovt;
3966
3967 // FIXME: Once remat is capable of dealing with instructions with register
3968 // operands, expand this into two nodes.
3969 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3970 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3971 TargetFlags));
3972 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3973 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3975 return Result;
3976}
3977
3978SDValue
3979ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3980 SDLoc dl(Op);
3981 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3982 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3983 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3984 Op.getOperand(1), Val);
3985}
3986
3987SDValue
3988ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3989 SDLoc dl(Op);
3990 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3991 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3992}
3993
3994SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3995 SelectionDAG &DAG) const {
3996 SDLoc dl(Op);
3997 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3998 Op.getOperand(0));
3999}
4000
4001SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4002 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
4003 unsigned IntNo =
4004 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4005 switch (IntNo) {
4006 default:
4007 return SDValue(); // Don't custom lower most intrinsics.
4008 case Intrinsic::arm_gnu_eabi_mcount: {
4009 MachineFunction &MF = DAG.getMachineFunction();
4010 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4011 SDLoc dl(Op);
4012 SDValue Chain = Op.getOperand(0);
4013 // call "\01__gnu_mcount_nc"
4014 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4015 const uint32_t *Mask =
4017 assert(Mask && "Missing call preserved mask for calling convention");
4018 // Mark LR as an implicit live-in.
4019 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4020 SDValue ReturnAddress =
4021 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4022 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4023 SDValue Callee =
4024 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4026 if (Subtarget->isThumb())
4027 return SDValue(
4028 DAG.getMachineNode(
4029 ARM::tBL_PUSHLR, dl, ResultTys,
4030 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4031 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4032 0);
4033 return SDValue(
4034 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4035 {ReturnAddress, Callee, RegisterMask, Chain}),
4036 0);
4037 }
4038 }
4039}
4040
4041SDValue
4042ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4043 const ARMSubtarget *Subtarget) const {
4044 unsigned IntNo = Op.getConstantOperandVal(0);
4045 SDLoc dl(Op);
4046 switch (IntNo) {
4047 default: return SDValue(); // Don't custom lower most intrinsics.
4048 case Intrinsic::thread_pointer: {
4049 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4050 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4051 }
4052 case Intrinsic::arm_cls: {
4053 const SDValue &Operand = Op.getOperand(1);
4054 const EVT VTy = Op.getValueType();
4055 SDValue SRA =
4056 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4057 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4058 SDValue SHL =
4059 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4060 SDValue OR =
4061 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4062 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4063 return Result;
4064 }
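  // Worked example for the expansion above (illustrative): CLS returns the
  // number of leading bits that match the sign bit, not counting the sign bit
  // itself, and is computed here as ctlz((((x >> 31) ^ x) << 1) | 1).
  // For x = 0x00000007: (x >> 31) is 0, the xor gives 0x7, shifting and or'ing
  // in 1 gives 0xF, and ctlz(0xF) == 28, which is indeed cls(0x00000007).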
4065 case Intrinsic::arm_cls64: {
4066 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4067 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4068 const SDValue &Operand = Op.getOperand(1);
4069 const EVT VTy = Op.getValueType();
4070 SDValue Lo, Hi;
4071 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4072 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4073 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4074 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4075 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4076 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4077 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4078 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4079 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4080 SDValue CheckLo =
4081 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4082 SDValue HiIsZero =
4083 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4084 SDValue AdjustedLo =
4085 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4086 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4087 SDValue Result =
4088 DAG.getSelect(dl, VTy, CheckLo,
4089 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4090 return Result;
4091 }
4092 case Intrinsic::eh_sjlj_lsda: {
4093 MachineFunction &MF = DAG.getMachineFunction();
4094 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4095 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4096 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4097 SDValue CPAddr;
4098 bool IsPositionIndependent = isPositionIndependent();
4099 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4100 ARMConstantPoolValue *CPV =
4101 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4102 ARMCP::CPLSDA, PCAdj);
4103 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4104 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4105 SDValue Result = DAG.getLoad(
4106 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4108
4109 if (IsPositionIndependent) {
4110 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4111 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4112 }
4113 return Result;
4114 }
4115 case Intrinsic::arm_neon_vabs:
4116 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4117 Op.getOperand(1));
4118 case Intrinsic::arm_neon_vabds:
4119 if (Op.getValueType().isInteger())
4120 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4121 Op.getOperand(1), Op.getOperand(2));
4122 return SDValue();
4123 case Intrinsic::arm_neon_vabdu:
4124 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4125 Op.getOperand(1), Op.getOperand(2));
4126 case Intrinsic::arm_neon_vmulls:
4127 case Intrinsic::arm_neon_vmullu: {
4128 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4129 ? ARMISD::VMULLs : ARMISD::VMULLu;
4130 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4131 Op.getOperand(1), Op.getOperand(2));
4132 }
4133 case Intrinsic::arm_neon_vminnm:
4134 case Intrinsic::arm_neon_vmaxnm: {
4135 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4136 ? ISD::FMINNUM : ISD::FMAXNUM;
4137 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4138 Op.getOperand(1), Op.getOperand(2));
4139 }
4140 case Intrinsic::arm_neon_vminu:
4141 case Intrinsic::arm_neon_vmaxu: {
4142 if (Op.getValueType().isFloatingPoint())
4143 return SDValue();
4144 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4145 ? ISD::UMIN : ISD::UMAX;
4146 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4147 Op.getOperand(1), Op.getOperand(2));
4148 }
4149 case Intrinsic::arm_neon_vmins:
4150 case Intrinsic::arm_neon_vmaxs: {
4151 // v{min,max}s is overloaded between signed integers and floats.
4152 if (!Op.getValueType().isFloatingPoint()) {
4153 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4154 ? ISD::SMIN : ISD::SMAX;
4155 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4156 Op.getOperand(1), Op.getOperand(2));
4157 }
4158 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4159 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4160 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4161 Op.getOperand(1), Op.getOperand(2));
4162 }
4163 case Intrinsic::arm_neon_vtbl1:
4164 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4165 Op.getOperand(1), Op.getOperand(2));
4166 case Intrinsic::arm_neon_vtbl2:
4167 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4168 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4169 case Intrinsic::arm_mve_pred_i2v:
4170 case Intrinsic::arm_mve_pred_v2i:
4171 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4172 Op.getOperand(1));
4173 case Intrinsic::arm_mve_vreinterpretq:
4174 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4175 Op.getOperand(1));
4176 case Intrinsic::arm_mve_lsll:
4177 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4178 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4179 case Intrinsic::arm_mve_asrl:
4180 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4181 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4182 }
4183}
4184
4186 const ARMSubtarget *Subtarget) {
4187 SDLoc dl(Op);
4188 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4189 if (SSID == SyncScope::SingleThread)
4190 return Op;
4191
4192 if (!Subtarget->hasDataBarrier()) {
4193 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
4194 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4195 // here.
4196 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4197 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4198 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4199 DAG.getConstant(0, dl, MVT::i32));
4200 }
4201
4202 AtomicOrdering Ord =
4203 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4205 if (Subtarget->isMClass()) {
4206 // Only a full system barrier exists in the M-class architectures.
4208 } else if (Subtarget->preferISHSTBarriers() &&
4209 Ord == AtomicOrdering::Release) {
4210 // Swift happens to implement ISHST barriers in a way that's compatible with
4211 // Release semantics but weaker than ISH so we'd be fools not to use
4212 // it. Beware: other processors probably don't!
4214 }
4215
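  // For example (illustrative, assuming the default inner-shareable domain):
  // a seq_cst fence becomes "dmb ish"; M-class cores only have the full-system
  // barrier, so they get "dmb sy"; and a release-only fence on a core that
  // prefers ISHST barriers (e.g. Swift) becomes "dmb ishst".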
4216 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4217 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4218 DAG.getConstant(Domain, dl, MVT::i32));
4219}
4220
4222 const ARMSubtarget *Subtarget) {
4223 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4224 if (!(Subtarget->isThumb2() ||
4225 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4226 // Just preserve the chain.
4227 return Op.getOperand(0);
4228
4229 SDLoc dl(Op);
4230 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4231 if (!isRead &&
4232 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4233 // ARMv7 with MP extension has PLDW.
4234 return Op.getOperand(0);
4235
4236 unsigned isData = Op.getConstantOperandVal(4);
4237 if (Subtarget->isThumb()) {
4238 // Invert the bits.
4239 isRead = ~isRead & 1;
4240 isData = ~isData & 1;
4241 }
4242
4243 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4244 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4245 DAG.getConstant(isData, dl, MVT::i32));
4246}
4247
4250 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4251
4252 // vastart just stores the address of the VarArgsFrameIndex slot into the
4253 // memory location argument.
4254 SDLoc dl(Op);
4256 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4257 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4258 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4259 MachinePointerInfo(SV));
4260}
4261
4262SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4263 CCValAssign &NextVA,
4264 SDValue &Root,
4265 SelectionDAG &DAG,
4266 const SDLoc &dl) const {
4267 MachineFunction &MF = DAG.getMachineFunction();
4268 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4269
4270 const TargetRegisterClass *RC;
4271 if (AFI->isThumb1OnlyFunction())
4272 RC = &ARM::tGPRRegClass;
4273 else
4274 RC = &ARM::GPRRegClass;
4275
4276 // Transform the arguments stored in physical registers into virtual ones.
4277 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4278 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4279
4280 SDValue ArgValue2;
4281 if (NextVA.isMemLoc()) {
4282 MachineFrameInfo &MFI = MF.getFrameInfo();
4283 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4284
4285 // Create load node to retrieve arguments from the stack.
4286 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4287 ArgValue2 = DAG.getLoad(
4288 MVT::i32, dl, Root, FIN,
4290 } else {
4291 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4292 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4293 }
4294 if (!Subtarget->isLittle())
4295 std::swap (ArgValue, ArgValue2);
4296 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4297}
4298
4299// The remaining GPRs hold either the beginning of variable-argument
4300// data, or the beginning of an aggregate passed by value (usually
4301// byval). Either way, we allocate stack slots adjacent to the data
4302// provided by our caller, and store the unallocated registers there.
4303// If this is a variadic function, the va_list pointer will begin with
4304// these values; otherwise, this reassembles a (byval) structure that
4305// was split between registers and memory.
4306 // Return: the frame index that the registers were stored into.
4307int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4308 const SDLoc &dl, SDValue &Chain,
4309 const Value *OrigArg,
4310 unsigned InRegsParamRecordIdx,
4311 int ArgOffset, unsigned ArgSize) const {
4312 // Currently, two use cases are possible:
4313 // Case #1. Non-varargs function, and we meet the first byval parameter.
4314 //          Set up the first unallocated register as the first byval register
4315 //          and consume all remaining registers
4316 //          (these two actions are performed by the HandleByVal method).
4317 //          Then, here, we initialize the stack frame with
4318 //          "store-reg" instructions.
4319 // Case #2. Varargs function that doesn't contain byval parameters.
4320 //          The same: consume all remaining unallocated registers and
4321 //          initialize the stack frame.
4322
4323 MachineFunction &MF = DAG.getMachineFunction();
4324 MachineFrameInfo &MFI = MF.getFrameInfo();
4325 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4326 unsigned RBegin, REnd;
4327 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4328 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4329 } else {
4330 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4331 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4332 REnd = ARM::R4;
4333 }
4334
4335 if (REnd != RBegin)
4336 ArgOffset = -4 * (ARM::R4 - RBegin);
4337
4338 auto PtrVT = getPointerTy(DAG.getDataLayout());
4339 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4340 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4341
4343 const TargetRegisterClass *RC =
4344 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4345
4346 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4347 Register VReg = MF.addLiveIn(Reg, RC);
4348 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4349 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4350 MachinePointerInfo(OrigArg, 4 * i));
4351 MemOps.push_back(Store);
4352 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4353 }
4354
4355 if (!MemOps.empty())
4356 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4357 return FrameIndex;
4358}
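// Worked example (illustrative): if the byval record says the aggregate starts
// in r2 (RBegin == R2, REnd == R4), then ArgOffset becomes -4 * (R4 - R2) == -8,
// so a fixed object is created 8 bytes below the incoming stack-argument area
// and r2/r3 are stored there, making the register-passed part of the aggregate
// contiguous with its stack-passed part.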
4359
4360 // Set up the stack frame that the va_list pointer will start from.
4361void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4362 const SDLoc &dl, SDValue &Chain,
4363 unsigned ArgOffset,
4364 unsigned TotalArgRegsSaveSize,
4365 bool ForceMutable) const {
4366 MachineFunction &MF = DAG.getMachineFunction();
4367 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4368
4369 // Try to store any remaining integer argument regs
4370 // to their spots on the stack so that they may be loaded by dereferencing
4371 // the result of va_next.
4372 // If there are no regs to be stored, just point the address after the last
4373 // argument passed via the stack.
4374 int FrameIndex = StoreByValRegs(
4375 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4376 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4377 AFI->setVarArgsFrameIndex(FrameIndex);
4378}
4379
4380bool ARMTargetLowering::splitValueIntoRegisterParts(
4381 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4382 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4383 EVT ValueVT = Val.getValueType();
4384 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4385 unsigned ValueBits = ValueVT.getSizeInBits();
4386 unsigned PartBits = PartVT.getSizeInBits();
4387 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4388 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4389 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4390 Parts[0] = Val;
4391 return true;
4392 }
4393 return false;
4394}
4395
4396SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4397 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4398 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4399 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4400 unsigned ValueBits = ValueVT.getSizeInBits();
4401 unsigned PartBits = PartVT.getSizeInBits();
4402 SDValue Val = Parts[0];
4403
4404 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4405 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4406 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4407 return Val;
4408 }
4409 return SDValue();
4410}
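// A small illustration of splitValueIntoRegisterParts /
// joinRegisterPartsIntoValue above (illustrative): when an f16 or bf16 value
// has to travel in an f32 part (e.g. a single-precision register), it is
// rewritten as f16 -> bitcast i16 -> any_extend i32 -> bitcast f32 on the
// producing side, and the receiving side applies the inverse
// bitcast/truncate/bitcast chain.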
4411
4412SDValue ARMTargetLowering::LowerFormalArguments(
4413 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4414 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4415 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4416 MachineFunction &MF = DAG.getMachineFunction();
4417 MachineFrameInfo &MFI = MF.getFrameInfo();
4418
4419 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4420
4421 // Assign locations to all of the incoming arguments.
4423 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4424 *DAG.getContext());
4425 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4426
4428 unsigned CurArgIdx = 0;
4429
4430 // Initially ArgRegsSaveSize is zero.
4431 // Then we increase this value each time we meet a byval parameter.
4432 // We also increase this value in the case of a varargs function.
4433 AFI->setArgRegsSaveSize(0);
4434
4435 // Calculate the amount of stack space that we need to allocate to store
4436 // byval and variadic arguments that are passed in registers.
4437 // We need to know this before we allocate the first byval or variadic
4438 // argument, as they will be allocated a stack slot below the CFA (Canonical
4439 // Frame Address, the stack pointer at entry to the function).
4440 unsigned ArgRegBegin = ARM::R4;
4441 for (const CCValAssign &VA : ArgLocs) {
4442 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4443 break;
4444
4445 unsigned Index = VA.getValNo();
4446 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4447 if (!Flags.isByVal())
4448 continue;
4449
4450 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4451 unsigned RBegin, REnd;
4452 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4453 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4454
4455 CCInfo.nextInRegsParam();
4456 }
4457 CCInfo.rewindByValRegsInfo();
4458
4459 int lastInsIndex = -1;
4460 if (isVarArg && MFI.hasVAStart()) {
4461 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4462 if (RegIdx != std::size(GPRArgRegs))
4463 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4464 }
4465
4466 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4467 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4468 auto PtrVT = getPointerTy(DAG.getDataLayout());
4469
4470 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4471 CCValAssign &VA = ArgLocs[i];
4472 if (Ins[VA.getValNo()].isOrigArg()) {
4473 std::advance(CurOrigArg,
4474 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4475 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4476 }
4477 // Arguments stored in registers.
4478 if (VA.isRegLoc()) {
4479 EVT RegVT = VA.getLocVT();
4480 SDValue ArgValue;
4481
4482 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4483 // f64 and vector types are split up into multiple registers or
4484 // combinations of registers and stack slots.
4485 SDValue ArgValue1 =
4486 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4487 VA = ArgLocs[++i]; // skip ahead to next loc
4488 SDValue ArgValue2;
4489 if (VA.isMemLoc()) {
4490 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4491 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4492 ArgValue2 = DAG.getLoad(
4493 MVT::f64, dl, Chain, FIN,
4495 } else {
4496 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4497 }
4498 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4499 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4500 ArgValue1, DAG.getIntPtrConstant(0, dl));
4501 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4502 ArgValue2, DAG.getIntPtrConstant(1, dl));
4503 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4504 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4505 } else {
4506 const TargetRegisterClass *RC;
4507
4508 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4509 RC = &ARM::HPRRegClass;
4510 else if (RegVT == MVT::f32)
4511 RC = &ARM::SPRRegClass;
4512 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4513 RegVT == MVT::v4bf16)
4514 RC = &ARM::DPRRegClass;
4515 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4516 RegVT == MVT::v8bf16)
4517 RC = &ARM::QPRRegClass;
4518 else if (RegVT == MVT::i32)
4519 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4520 : &ARM::GPRRegClass;
4521 else
4522 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4523
4524 // Transform the arguments in physical registers into virtual ones.
4525 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4526 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4527
4528 // If this value is passed in r0 and has the returned attribute (e.g.
4529 // C++ 'structors), record this fact for later use.
4530 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4531 AFI->setPreservesR0();
4532 }
4533 }
4534
4535 // If this is an 8 or 16-bit value, it is really passed promoted
4536 // to 32 bits. Insert an assert[sz]ext to capture this, then
4537 // truncate to the right size.
4538 switch (VA.getLocInfo()) {
4539 default: llvm_unreachable("Unknown loc info!");
4540 case CCValAssign::Full: break;
4541 case CCValAssign::BCvt:
4542 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4543 break;
4544 }
4545
4546 // f16 arguments have their size extended to 4 bytes and passed as if they
4547 // had been copied to the LSBs of a 32-bit register.
4548 // To do this, they are passed extended to i32 (soft float ABI) or to f32 (hard float ABI).
4549 if (VA.needsCustom() &&
4550 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4551 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4552
4553 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4554 // less than 32 bits must be sign- or zero-extended in the callee for
4555 // security reasons. Although the ABI mandates an extension done by the
4556 // caller, the latter cannot be trusted to follow the rules of the ABI.
4557 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4558 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4559 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4560 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4561
4562 InVals.push_back(ArgValue);
4563 } else { // VA.isRegLoc()
4564 // Only arguments passed on the stack should make it here.
4565 assert(VA.isMemLoc());
4566 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4567
4568 int index = VA.getValNo();
4569
4570 // Some Ins[] entries become multiple ArgLoc[] entries.
4571 // Process them only once.
4572 if (index != lastInsIndex)
4573 {
4574 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4575 // FIXME: For now, all byval parameter objects are marked mutable.
4576 // This can be changed with more analysis.
4577 // In case of tail call optimization, mark all arguments mutable, since
4578 // they could be overwritten by the lowering of arguments in the case of
4579 // a tail call.
4580 if (Flags.isByVal()) {
4581 assert(Ins[index].isOrigArg() &&
4582 "Byval arguments cannot be implicit");
4583 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4584
4585 int FrameIndex = StoreByValRegs(
4586 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4587 VA.getLocMemOffset(), Flags.getByValSize());
4588 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4589 CCInfo.nextInRegsParam();
4590 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4591 VA.getValVT() == MVT::bf16)) {
4592 // f16 and bf16 values are passed in the least-significant half of
4593 // a 4-byte stack slot. This is done as if the extension had been done
4594 // in a 32-bit register, so the actual bytes used for the value
4595 // differ between little-endian and big-endian targets.
4596 assert(VA.getLocVT().getSizeInBits() == 32);
4597 unsigned FIOffset = VA.getLocMemOffset();
4598 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4599 FIOffset, true);
4600
4601 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4602 if (DAG.getDataLayout().isBigEndian())
4603 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4604
4605 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4607 DAG.getMachineFunction(), FI)));
4608
4609 } else {
4610 unsigned FIOffset = VA.getLocMemOffset();
4611 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4612 FIOffset, true);
4613
4614 // Create load nodes to retrieve arguments from the stack.
4615 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4616 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4618 DAG.getMachineFunction(), FI)));
4619 }
4620 lastInsIndex = index;
4621 }
4622 }
4623 }
4624
4625 // varargs
4626 if (isVarArg && MFI.hasVAStart()) {
4627 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4628 TotalArgRegsSaveSize);
4629 if (AFI->isCmseNSEntryFunction()) {
4630 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4632 "secure entry function must not be variadic", dl.getDebugLoc()));
4633 }
4634 }
4635
4636 unsigned StackArgSize = CCInfo.getStackSize();
4637 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4638 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4639 // The only way to guarantee a tail call is if the callee restores its
4640 // argument area, but it must also keep the stack aligned when doing so.
4641 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4642 assert(StackAlign && "data layout string is missing stack alignment");
4643 StackArgSize = alignTo(StackArgSize, *StackAlign);
4644
4645 AFI->setArgumentStackToRestore(StackArgSize);
4646 }
4647 AFI->setArgumentStackSize(StackArgSize);
4648
4649 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4650 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4652 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4653 }
4654
4655 return Chain;
4656}
4657
4658/// isFloatingPointZero - Return true if this is +0.0.
4661 return CFP->getValueAPF().isPosZero();
4662 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4663 // Maybe this has already been legalized into the constant pool?
4664 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4665 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4667 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4668 return CFP->getValueAPF().isPosZero();
4669 }
4670 } else if (Op->getOpcode() == ISD::BITCAST &&
4671 Op->getValueType(0) == MVT::f64) {
4672 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4673 // created by LowerConstantFP().
4674 SDValue BitcastOp = Op->getOperand(0);
4675 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4676 isNullConstant(BitcastOp->getOperand(0)))
4677 return true;
4678 }
4679 return false;
4680}
4681
4682 /// Returns an appropriate ARM CMP (cmp) and corresponding condition code for
4683/// the given operands.
4684SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4685 SDValue &ARMcc, SelectionDAG &DAG,
4686 const SDLoc &dl) const {
4687 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4688 unsigned C = RHSC->getZExtValue();
4689 if (!isLegalICmpImmediate((int32_t)C)) {
4690 // Constant does not fit, try adjusting it by one.
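      // For example (illustrative): on an ARM/Thumb2 target, "x < 0x101"
      // (SETLT) cannot encode 0x101 as a modified immediate, but 0x100 can be
      // encoded, so the comparison is rewritten as the equivalent
      // "x <= 0x100" (SETLE).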
4691 switch (CC) {
4692 default: break;
4693 case ISD::SETLT:
4694 case ISD::SETGE:
4695 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4696 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4697 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4698 }
4699 break;
4700 case ISD::SETULT:
4701 case ISD::SETUGE:
4702 if (C != 0 && isLegalICmpImmediate(C-1)) {
4703 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4704 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4705 }
4706 break;
4707 case ISD::SETLE:
4708 case ISD::SETGT:
4709 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4710 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4711 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4712 }
4713 break;
4714 case ISD::SETULE:
4715 case ISD::SETUGT:
4716 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4717 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4718 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4719 }
4720 break;
4721 }
4722 }
4723 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4725 // In ARM and Thumb-2, the compare instructions can shift their second
4726 // operand.
4728 std::swap(LHS, RHS);
4729 }
4730
4731 // Thumb1 has very limited immediate modes, so turning an "and" into a
4732 // shift can save multiple instructions.
4733 //
4734 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4735 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4736 // own. If it's the operand to an unsigned comparison with an immediate,
4737 // we can eliminate one of the shifts: we transform
4738 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4739 //
4740 // We avoid transforming cases which aren't profitable due to encoding
4741 // details:
4742 //
4743 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4744 // would not; in that case, we're essentially trading one immediate load for
4745 // another.
4746 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4747 // 3. C2 is zero; we have other code for this special case.
4748 //
4749 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4750 // instruction, since the AND is always one instruction anyway, but we could
4751 // use narrow instructions in some cases.
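  // For example (illustrative): on Thumb1, "(x & 0x7fffffff) == 100" would
  // need the mask materialized, but with ShiftBits == 1 it becomes
  // "(x << 1) == 200", where both the shift and the compare-with-immediate
  // are single narrow instructions.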
4752 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4753 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4754 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4755 !isSignedIntSetCC(CC)) {
4756 unsigned Mask = LHS.getConstantOperandVal(1);
4757 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4758 uint64_t RHSV = RHSC->getZExtValue();
4759 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4760 unsigned ShiftBits = llvm::countl_zero(Mask);
4761 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4762 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4763 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4764 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4765 }
4766 }
4767 }
4768
4769 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4770 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4771 // way a cmp would.
4772 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4773 // some tweaks to the heuristics for the previous and->shift transform.
4774 // FIXME: Optimize cases where the LHS isn't a shift.
4775 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4776 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4777 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4778 LHS.getConstantOperandVal(1) < 31) {
4779 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4780 SDValue Shift =
4781 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4782 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4783 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4784 return Shift.getValue(1);
4785 }
4786
4788
4789 // If the RHS is a constant zero then the V (overflow) flag will never be
4790 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4791 // simpler for other passes (like the peephole optimiser) to deal with.
4792 if (isNullConstant(RHS)) {
4793 switch (CondCode) {
4794 default: break;
4795 case ARMCC::GE:
4797 break;
4798 case ARMCC::LT:
4800 break;
4801 }
4802 }
4803
4804 ARMISD::NodeType CompareType;
4805 switch (CondCode) {
4806 default:
4807 CompareType = ARMISD::CMP;
4808 break;
4809 case ARMCC::EQ:
4810 case ARMCC::NE:
4811 // Uses only Z Flag
4812 CompareType = ARMISD::CMPZ;
4813 break;
4814 }
4815 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4816 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4817}
4818
4819 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4820SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4821 SelectionDAG &DAG, const SDLoc &dl,
4822 bool Signaling) const {
4823 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4824 SDValue Flags;
4826 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4827 LHS, RHS);
4828 else
4829 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4830 FlagsVT, LHS);
4831 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4832}
4833
4834// This function returns three things: the arithmetic computation itself
4835// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4836// comparison and the condition code define the case in which the arithmetic
4837// computation *does not* overflow.
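// For example (illustrative), for ISD::UADDO this produces
//   Value       = ADDC(LHS, RHS)
//   OverflowCmp = CMP(Value, LHS)
// with ARMcc = HS: the unsigned add did not wrap exactly when Value is
// unsigned-greater-than-or-equal to LHS, which is the HS condition after the
// compare.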
4838std::pair<SDValue, SDValue>
4839ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4840 SDValue &ARMcc) const {
4841 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4842
4843 SDValue Value, OverflowCmp;
4844 SDValue LHS = Op.getOperand(0);
4845 SDValue RHS = Op.getOperand(1);
4846 SDLoc dl(Op);
4847
4848 // FIXME: We are currently always generating CMPs because we don't support
4849 // generating CMN through the backend. This is not as good as the natural
4850 // CMP case because it causes a register dependency and cannot be folded
4851 // later.
4852
4853 switch (Op.getOpcode()) {
4854 default:
4855 llvm_unreachable("Unknown overflow instruction!");
4856 case ISD::SADDO:
4857 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4858 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4859 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4860 break;
4861 case ISD::UADDO:
4862 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4863 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4864 // We do not use it in the USUBO case as Value may not be used.
4865 Value = DAG.getNode(ARMISD::ADDC, dl,
4866 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4867 .getValue(0);
4868 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4869 break;
4870 case ISD::SSUBO:
4871 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4872 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4873 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4874 break;
4875 case ISD::USUBO:
4876 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4877 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4878 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4879 break;
4880 case ISD::UMULO:
4881 // We generate a UMUL_LOHI and then check if the high word is 0.
4882 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4883 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4884 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4885 LHS, RHS);
4886 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4887 DAG.getConstant(0, dl, MVT::i32));
4888 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4889 break;
4890 case ISD::SMULO:
4891 // We generate a SMUL_LOHI and then check if all the bits of the high word
4892 // are the same as the sign bit of the low word.
4893 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4894 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4895 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4896 LHS, RHS);
4897 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4898 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4899 Value.getValue(0),
4900 DAG.getConstant(31, dl, MVT::i32)));
4901 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4902 break;
4903 } // switch (...)
4904
4905 return std::make_pair(Value, OverflowCmp);
4906}
4907
4908SDValue
4909ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4910 // Let legalize expand this if it isn't a legal type yet.
4911 if (!isTypeLegal(Op.getValueType()))
4912 return SDValue();
4913
4914 SDValue Value, OverflowCmp;
4915 SDValue ARMcc;
4916 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4917 SDLoc dl(Op);
4918 // We use 0 and 1 as false and true values.
4919 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4920 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4921 EVT VT = Op.getValueType();
4922
4923 SDValue Overflow =
4924 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4925
4926 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4927 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4928}
4929
4931 SelectionDAG &DAG) {
4932 SDLoc DL(BoolCarry);
4933 EVT CarryVT = BoolCarry.getValueType();
4934
4935 // This converts the boolean value carry into the carry flag by doing
4936 // ARMISD::SUBC Carry, 1
4937 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4938 DAG.getVTList(CarryVT, MVT::i32),
4939 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4940 return Carry.getValue(1);
4941}
4942
4943static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4944 SelectionDAG &DAG) {
4945 SDLoc DL(Flags);
4946
4947 // Now convert the carry flag into a boolean carry. We do this
4948 // using ARMISD:ADDE 0, 0, Carry
4949 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4950 DAG.getConstant(0, DL, MVT::i32),
4951 DAG.getConstant(0, DL, MVT::i32), Flags);
4952}
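// Illustrative note on the two helpers above (a sketch, not generated code):
// ARM subtraction sets the carry flag when there is *no* borrow, so
// SUBC(Bool, 1) produces carry == 1 exactly when Bool == 1, and
// ADDE(0, 0, Flags) reads the carry flag back as the integer 0 or 1.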
4953
4954SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4955 SelectionDAG &DAG) const {
4956 // Let legalize expand this if it isn't a legal type yet.
4957 if (!isTypeLegal(Op.getValueType()))
4958 return SDValue();
4959
4960 SDValue LHS = Op.getOperand(0);
4961 SDValue RHS = Op.getOperand(1);
4962 SDLoc dl(Op);
4963
4964 EVT VT = Op.getValueType();
4965 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4966 SDValue Value;
4967 SDValue Overflow;
4968 switch (Op.getOpcode()) {
4969 default:
4970 llvm_unreachable("Unknown overflow instruction!");
4971 case ISD::UADDO:
4972 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4973 // Convert the carry flag into a boolean value.
4974 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4975 break;
4976 case ISD::USUBO: {
4977 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4978 // Convert the carry flag into a boolean value.
4979 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4980 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4981 // value. So compute 1 - C.
4982 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4983 DAG.getConstant(1, dl, MVT::i32), Overflow);
4984 break;
4985 }
4986 }
4987
4988 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4989}
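// Worked example for the USUBO path (informal): 3 - 5 borrows, so SUBC yields
// a carry flag of 0 (ARM carry == NOT borrow), ADDE reads that back as 0, and
// the final 1 - 0 computation reports Overflow = 1 as expected.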
4990
4991static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4992 const ARMSubtarget *Subtarget) {
4993 EVT VT = Op.getValueType();
4994 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4995 return SDValue();
4996 if (!VT.isSimple())
4997 return SDValue();
4998
4999 unsigned NewOpcode;
5000 switch (VT.getSimpleVT().SimpleTy) {
5001 default:
5002 return SDValue();
5003 case MVT::i8:
5004 switch (Op->getOpcode()) {
5005 case ISD::UADDSAT:
5006 NewOpcode = ARMISD::UQADD8b;
5007 break;
5008 case ISD::SADDSAT:
5009 NewOpcode = ARMISD::QADD8b;
5010 break;
5011 case ISD::USUBSAT:
5012 NewOpcode = ARMISD::UQSUB8b;
5013 break;
5014 case ISD::SSUBSAT:
5015 NewOpcode = ARMISD::QSUB8b;
5016 break;
5017 }
5018 break;
5019 case MVT::i16:
5020 switch (Op->getOpcode()) {
5021 case ISD::UADDSAT:
5022 NewOpcode = ARMISD::UQADD16b;
5023 break;
5024 case ISD::SADDSAT:
5025 NewOpcode = ARMISD::QADD16b;
5026 break;
5027 case ISD::USUBSAT:
5028 NewOpcode = ARMISD::UQSUB16b;
5029 break;
5030 case ISD::SSUBSAT:
5031 NewOpcode = ARMISD::QSUB16b;
5032 break;
5033 }
5034 break;
5035 }
5036
5037 SDLoc dl(Op);
5038 SDValue Add =
5039 DAG.getNode(NewOpcode, dl, MVT::i32,
5040 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5041 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5042 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5043}
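// Rough illustration (informal): an i8 saddsat of 100 + 100 saturates to 127
// instead of wrapping to -56; the QADD8b/QADD16b style nodes above select the
// saturating DSP add/sub instructions on sign-extended 32-bit operands, and
// the result is truncated back to the original width afterwards.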
5044
5045SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5046 SDValue Cond = Op.getOperand(0);
5047 SDValue SelectTrue = Op.getOperand(1);
5048 SDValue SelectFalse = Op.getOperand(2);
5049 SDLoc dl(Op);
5050 unsigned Opc = Cond.getOpcode();
5051
5052 if (Cond.getResNo() == 1 &&
5053 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5054 Opc == ISD::USUBO)) {
5055 if (!isTypeLegal(Cond->getValueType(0)))
5056 return SDValue();
5057
5058 SDValue Value, OverflowCmp;
5059 SDValue ARMcc;
5060 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5061 EVT VT = Op.getValueType();
5062
5063 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5064 }
5065
5066 // Convert:
5067 //
5068 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5069 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5070 //
5071 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5072 const ConstantSDNode *CMOVTrue =
5073 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5074 const ConstantSDNode *CMOVFalse =
5075 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5076
5077 if (CMOVTrue && CMOVFalse) {
5078 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5079 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5080
5081 SDValue True;
5082 SDValue False;
5083 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5084 True = SelectTrue;
5085 False = SelectFalse;
5086 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5087 True = SelectFalse;
5088 False = SelectTrue;
5089 }
5090
5091 if (True.getNode() && False.getNode())
5092 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5093 Cond.getOperand(3), DAG);
5094 }
5095 }
5096
5097 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5098 // undefined bits before doing a full-word comparison with zero.
5099 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5100 DAG.getConstant(1, dl, Cond.getValueType()));
5101
5102 return DAG.getSelectCC(dl, Cond,
5103 DAG.getConstant(0, dl, Cond.getValueType()),
5104 SelectTrue, SelectFalse, ISD::SETNE);
5105}
5106
5107static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5108 bool &swpCmpOps, bool &swpVselOps) {
5109 // Start by selecting the GE condition code for opcodes that return true for
5110 // 'equality'
5111 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5112 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5113 CondCode = ARMCC::GE;
5114
5115 // and GT for opcodes that return false for 'equality'.
5116 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5117 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5118 CondCode = ARMCC::GT;
5119
5120 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5121 // to swap the compare operands.
5122 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5123 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5124 swpCmpOps = true;
5125
5126 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5127 // If we have an unordered opcode, we need to swap the operands to the VSEL
5128 // instruction (effectively negating the condition).
5129 //
5130 // This also has the effect of swapping which one of 'less' or 'greater'
5131 // returns true, so we also swap the compare operands. It also switches
5132 // whether we return true for 'equality', so we compensate by picking the
5133 // opposite condition code to our original choice.
5134 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5135 CC == ISD::SETUGT) {
5136 swpCmpOps = !swpCmpOps;
5137 swpVselOps = !swpVselOps;
5138 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5139 }
5140
5141 // 'ordered' is 'anything but unordered', so use the VS condition code and
5142 // swap the VSEL operands.
5143 if (CC == ISD::SETO) {
5144 CondCode = ARMCC::VS;
5145 swpVselOps = true;
5146 }
5147
5148 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5149 // code and swap the VSEL operands. Also do this if we don't care about the
5150 // unordered case.
5151 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5152 CondCode = ARMCC::EQ;
5153 swpVselOps = true;
5154 }
5155}
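// Example of the constraint juggling above (informal): for SETULT the first
// block picks GT, the 'less' clause requests a compare-operand swap, and the
// unordered clause then swaps it back, flips the VSEL operands and switches GT
// to GE -- so "a <u b ? t : f" is emitted as "a >= b (ordered) ? f : t".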
5156
5157SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5158 SDValue TrueVal, SDValue ARMcc,
5159 SDValue Flags, SelectionDAG &DAG) const {
5160 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5161 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5162 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5163 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5164 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5165
5165
5166 SDValue TrueLow = TrueVal.getValue(0);
5167 SDValue TrueHigh = TrueVal.getValue(1);
5168 SDValue FalseLow = FalseVal.getValue(0);
5169 SDValue FalseHigh = FalseVal.getValue(1);
5170
5171 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5172 ARMcc, Flags);
5173 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5174 ARMcc, Flags);
5175
5176 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5177 }
5178 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5179}
5180
5181static bool isGTorGE(ISD::CondCode CC) {
5182 return CC == ISD::SETGT || CC == ISD::SETGE;
5183}
5184
5185static bool isLTorLE(ISD::CondCode CC) {
5186 return CC == ISD::SETLT || CC == ISD::SETLE;
5187}
5188
5189// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5190// All of these conditions (and their <= and >= counterparts) will do:
5191// x < k ? k : x
5192// x > k ? x : k
5193// k < x ? x : k
5194// k > x ? k : x
5195static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5196 const SDValue TrueVal, const SDValue FalseVal,
5197 const ISD::CondCode CC, const SDValue K) {
5198 return (isGTorGE(CC) &&
5199 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5200 (isLTorLE(CC) &&
5201 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5202}
5203
5204// Check if two chained conditionals could be converted into SSAT or USAT.
5205//
5206// SSAT can replace a set of two conditional selectors that bound a number to an
5207// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5208//
5209// x < -k ? -k : (x > k ? k : x)
5210// x < -k ? -k : (x < k ? x : k)
5211// x > -k ? (x > k ? k : x) : -k
5212// x < k ? (x < -k ? -k : x) : k
5213// etc.
5214//
5215// LLVM canonicalizes these to either a min(max()) or a max(min())
5216// pattern. This function tries to match one of these and will return a SSAT
5217// node if successful.
5218//
5219 // USAT works similarly to SSAT, but bounds the value to the interval [0, k],
5220 // where k + 1 is a power of 2.
5221static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5222 EVT VT = Op.getValueType();
5223 SDValue V1 = Op.getOperand(0);
5224 SDValue K1 = Op.getOperand(1);
5225 SDValue TrueVal1 = Op.getOperand(2);
5226 SDValue FalseVal1 = Op.getOperand(3);
5227 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5228
5229 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5230 if (Op2.getOpcode() != ISD::SELECT_CC)
5231 return SDValue();
5232
5233 SDValue V2 = Op2.getOperand(0);
5234 SDValue K2 = Op2.getOperand(1);
5235 SDValue TrueVal2 = Op2.getOperand(2);
5236 SDValue FalseVal2 = Op2.getOperand(3);
5237 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5238
5239 SDValue V1Tmp = V1;
5240 SDValue V2Tmp = V2;
5241
5242 // Check that the registers and the constants match a max(min()) or min(max())
5243 // pattern
5244 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5245 K2 != FalseVal2 ||
5246 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5247 return SDValue();
5248
5249 // Check that the constant in the lower-bound check is
5250 // the opposite of the constant in the upper-bound check
5251 // in 1's complement.
5252 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5253 return SDValue();
5254
5255 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5256 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5257 int64_t PosVal = std::max(Val1, Val2);
5258 int64_t NegVal = std::min(Val1, Val2);
5259
5260 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5261 !isPowerOf2_64(PosVal + 1))
5262 return SDValue();
5263
5264 // Handle the difference between USAT (unsigned) and SSAT (signed)
5265 // saturation
5266 // At this point, PosVal is guaranteed to be positive
5267 uint64_t K = PosVal;
5268 SDLoc dl(Op);
5269 if (Val1 == ~Val2)
5270 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5271 DAG.getConstant(llvm::countr_one(K), dl, VT));
5272 if (NegVal == 0)
5273 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5274 DAG.getConstant(llvm::countr_one(K), dl, VT));
5275
5276 return SDValue();
5277}
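// Worked example (informal): the chain "x < -128 ? -128 : (x > 127 ? 127 : x)"
// clamps x to [-128, 127]; the two constants are -128 and 127, PosVal + 1 == 128
// is a power of two and -128 == ~127, so the pair of selects collapses into a
// single ARMISD::SSAT node. Clamping to [0, 255] instead takes the USAT path.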
5278
5279// Check if a condition of the type x < k ? k : x can be converted into a
5280// bit operation instead of conditional moves.
5281// Currently this is allowed given:
5282// - The conditions and values match up
5283// - k is 0 or -1 (all ones)
5284 // This function will not check the last condition; that is up to the caller.
5285 // It returns true if the transformation can be made, and in that case
5286 // returns x in V and k in SatK.
5287static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5288 SDValue &SatK)
5289{
5290 SDValue LHS = Op.getOperand(0);
5291 SDValue RHS = Op.getOperand(1);
5292 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5293 SDValue TrueVal = Op.getOperand(2);
5294 SDValue FalseVal = Op.getOperand(3);
5295
5296 const SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5297 ? &RHS
5298 : nullptr;
5299
5300 // No constant operand in the comparison; early out.
5301 if (!K)
5302 return false;
5303
5304 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5305 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5306 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5307
5308 // If the constant in the comparison does not match the constant in the
5309 // select, or the variables do not match, early out.
5310 if (*K != KTmp || V != VTmp)
5311 return false;
5312
5313 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5314 SatK = *K;
5315 return true;
5316 }
5317
5318 return false;
5319}
5320
5321bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5322 if (VT == MVT::f32)
5323 return !Subtarget->hasVFP2Base();
5324 if (VT == MVT::f64)
5325 return !Subtarget->hasFP64();
5326 if (VT == MVT::f16)
5327 return !Subtarget->hasFullFP16();
5328 return false;
5329}
5330
5331SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5332 EVT VT = Op.getValueType();
5333 SDLoc dl(Op);
5334
5335 // Try to convert two saturating conditional selects into a single SSAT
5336 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5337 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5338 return SatValue;
5339
5340 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5341 // into more efficient bit operations, which is possible when k is 0 or -1
5342 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5343 // single instructions. On Thumb the shift and the bit operation will be two
5344 // instructions.
5345 // Only allow this transformation on full-width (32-bit) operations
5346 SDValue LowerSatConstant;
5347 SDValue SatValue;
5348 if (VT == MVT::i32 &&
5349 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5350 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5351 DAG.getConstant(31, dl, VT));
5352 if (isNullConstant(LowerSatConstant)) {
5353 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5354 DAG.getAllOnesConstant(dl, VT));
5355 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5356 } else if (isAllOnesConstant(LowerSatConstant))
5357 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5358 }
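// Informal illustration of the bit trick above: with k == 0 the select becomes
// max(x, 0) == x & ~(x >> 31), and with k == -1 it becomes max(x, -1) ==
// x | (x >> 31); both avoid a compare plus conditional move.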
5359
5360 SDValue LHS = Op.getOperand(0);
5361 SDValue RHS = Op.getOperand(1);
5362 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5363 SDValue TrueVal = Op.getOperand(2);
5364 SDValue FalseVal = Op.getOperand(3);
5365 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5366 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5367 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5368 if (Op.getValueType().isInteger()) {
5369
5370 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5371 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5372 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5373 // Both require fewer instructions than a compare and conditional select.
5374 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5375 RHSC->isZero() && CFVal && CFVal->isZero() &&
5376 LHS.getValueType() == RHS.getValueType()) {
5377 EVT VT = LHS.getValueType();
5378 SDValue Shift =
5379 DAG.getNode(ISD::SRA, dl, VT, LHS,
5380 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5381
5382 if (CC == ISD::SETGT)
5383 Shift = DAG.getNOT(dl, Shift, VT);
5384
5385 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5386 }
5387 }
5388
5389 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5390 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5391 unsigned TVal = CTVal->getZExtValue();
5392 unsigned FVal = CFVal->getZExtValue();
5393 unsigned Opcode = 0;
5394
5395 if (TVal == ~FVal) {
5396 Opcode = ARMISD::CSINV;
5397 } else if (TVal == ~FVal + 1) {
5398 Opcode = ARMISD::CSNEG;
5399 } else if (TVal + 1 == FVal) {
5400 Opcode = ARMISD::CSINC;
5401 } else if (TVal == FVal + 1) {
5402 Opcode = ARMISD::CSINC;
5403 std::swap(TrueVal, FalseVal);
5404 std::swap(TVal, FVal);
5405 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5406 }
5407
5408 if (Opcode) {
5409 // If one of the constants is cheaper than another, materialise the
5410 // cheaper one and let the csel generate the other.
5411 if (Opcode != ARMISD::CSINC &&
5412 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5413 std::swap(TrueVal, FalseVal);
5414 std::swap(TVal, FVal);
5415 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5416 }
5417
5418 // Attempt to use ZR by checking whether TVal is 0, possibly inverting
5419 // the condition to get there. CSINC is not invertible like the other two
5420 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
5421 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5422 std::swap(TrueVal, FalseVal);
5423 std::swap(TVal, FVal);
5424 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5425 }
5426
5427 // Drop FVal, since it can be recovered by inverting/negating TVal.
5428 FalseVal = TrueVal;
5429
5430 SDValue ARMcc;
5431 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5432 EVT VT = TrueVal.getValueType();
5433 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5434 }
5435 }
5436
5437 if (isUnsupportedFloatingType(LHS.getValueType())) {
5438 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5439
5440 // If softenSetCCOperands only returned one value, we should compare it to
5441 // zero.
5442 if (!RHS.getNode()) {
5443 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5444 CC = ISD::SETNE;
5445 }
5446 }
5447
5448 if (LHS.getValueType() == MVT::i32) {
5449 // Try to generate VSEL on ARMv8.
5450 // The VSEL instruction can't use all the usual ARM condition
5451 // codes: it only has two bits to select the condition code, so it's
5452 // constrained to use only GE, GT, VS and EQ.
5453 //
5454 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5455 // swap the operands of the previous compare instruction (effectively
5456 // inverting the compare condition, swapping 'less' and 'greater') and
5457 // sometimes need to swap the operands to the VSEL (which inverts the
5458 // condition in the sense of firing whenever the previous condition didn't)
5459 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5460 TrueVal.getValueType() == MVT::f32 ||
5461 TrueVal.getValueType() == MVT::f64)) {
5462 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5463 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5464 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5465 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5466 std::swap(TrueVal, FalseVal);
5467 }
5468 }
5469
5470 SDValue ARMcc;
5471 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5472 // Choose GE over PL, which vsel does not support.
5473 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5474 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5475 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5476 }
5477
5478 ARMCC::CondCodes CondCode, CondCode2;
5479 FPCCToARMCC(CC, CondCode, CondCode2);
5480
5481 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5482 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5483 // must use VSEL (limited condition codes), due to not having conditional f16
5484 // moves.
5485 if (Subtarget->hasFPARMv8Base() &&
5486 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5487 (TrueVal.getValueType() == MVT::f16 ||
5488 TrueVal.getValueType() == MVT::f32 ||
5489 TrueVal.getValueType() == MVT::f64)) {
5490 bool swpCmpOps = false;
5491 bool swpVselOps = false;
5492 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5493
5494 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5495 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5496 if (swpCmpOps)
5497 std::swap(LHS, RHS);
5498 if (swpVselOps)
5499 std::swap(TrueVal, FalseVal);
5500 }
5501 }
5502
5503 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5504 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5505 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5506 if (CondCode2 != ARMCC::AL) {
5507 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5508 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5509 }
5510 return Result;
5511}
5512
5513/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5514/// to morph to an integer compare sequence.
5515static bool canChangeToInt(SDValue Op, bool &SeenZero,
5516 const ARMSubtarget *Subtarget) {
5517 SDNode *N = Op.getNode();
5518 if (!N->hasOneUse())
5519 // Otherwise it requires moving the value from fp to integer registers.
5520 return false;
5521 if (!N->getNumValues())
5522 return false;
5523 EVT VT = Op.getValueType();
5524 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5525 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5526 // vmrs are very slow, e.g. cortex-a8.
5527 return false;
5528
5529 if (isFloatingPointZero(Op)) {
5530 SeenZero = true;
5531 return true;
5532 }
5533 return ISD::isNormalLoad(N);
5534}
5535
5536static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5537 if (isFloatingPointZero(Op))
5538 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5539
5540 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5541 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5542 Ld->getPointerInfo(), Ld->getAlign(),
5543 Ld->getMemOperand()->getFlags());
5544
5545 llvm_unreachable("Unknown VFP cmp argument!");
5546}
5547
5548static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5549 SDValue &RetVal1, SDValue &RetVal2) {
5550 SDLoc dl(Op);
5551
5552 if (isFloatingPointZero(Op)) {
5553 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5554 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5555 return;
5556 }
5557
5558 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5559 SDValue Ptr = Ld->getBasePtr();
5560 RetVal1 =
5561 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5562 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5563
5564 EVT PtrType = Ptr.getValueType();
5565 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5566 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5567 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5568 Ld->getPointerInfo().getWithOffset(4),
5569 commonAlignment(Ld->getAlign(), 4),
5570 Ld->getMemOperand()->getFlags());
5571 return;
5572 }
5573
5574 llvm_unreachable("Unknown VFP cmp argument!");
5575}
5576
5577/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5578/// f32 and even f64 comparisons to integer ones.
5579SDValue
5580ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5581 SDValue Chain = Op.getOperand(0);
5582 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5583 SDValue LHS = Op.getOperand(2);
5584 SDValue RHS = Op.getOperand(3);
5585 SDValue Dest = Op.getOperand(4);
5586 SDLoc dl(Op);
5587
5588 bool LHSSeenZero = false;
5589 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5590 bool RHSSeenZero = false;
5591 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5592 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5593 // Since NaNs are excluded and there are no other uses of the CMP operands,
5594 // and the condition code is EQ or NE, we can optimize the compare to an
5595 // integer comparison.
5596 if (CC == ISD::SETOEQ)
5597 CC = ISD::SETEQ;
5598 else if (CC == ISD::SETUNE)
5599 CC = ISD::SETNE;
5600
5601 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5602 SDValue ARMcc;
5603 if (LHS.getValueType() == MVT::f32) {
5604 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5605 bitcastf32Toi32(LHS, DAG), Mask);
5606 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5607 bitcastf32Toi32(RHS, DAG), Mask);
5608 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5609 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5610 Cmp);
5611 }
5612
5613 SDValue LHS1, LHS2;
5614 SDValue RHS1, RHS2;
5615 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5616 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5617 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5618 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5619 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5620 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5621 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5622 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5623 }
5624
5625 return SDValue();
5626}
5627
5628// Generate CMP + CMOV for integer abs.
5629SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5630 SDLoc DL(Op);
5631
5632 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5633
5634 // Generate CMP & CMOV.
5635 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5636 DAG.getConstant(0, DL, MVT::i32));
5637 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5638 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5639}
5640
5641SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5642 SDValue Chain = Op.getOperand(0);
5643 SDValue Cond = Op.getOperand(1);
5644 SDValue Dest = Op.getOperand(2);
5645 SDLoc dl(Op);
5646
5647 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5648 // instruction.
5649 unsigned Opc = Cond.getOpcode();
5650 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5651 !Subtarget->isThumb1Only();
5652 if (Cond.getResNo() == 1 &&
5653 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5654 Opc == ISD::USUBO || OptimizeMul)) {
5655 // Only lower legal XALUO ops.
5656 if (!isTypeLegal(Cond->getValueType(0)))
5657 return SDValue();
5658
5659 // The actual operation with overflow check.
5660 SDValue Value, OverflowCmp;
5661 SDValue ARMcc;
5662 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5663
5664 // Reverse the condition code.
5665 ARMCC::CondCodes CondCode =
5666 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5667 CondCode = ARMCC::getOppositeCondition(CondCode);
5668 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5669
5670 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5671 OverflowCmp);
5672 }
5673
5674 return SDValue();
5675}
5676
5677SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5678 SDValue Chain = Op.getOperand(0);
5679 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5680 SDValue LHS = Op.getOperand(2);
5681 SDValue RHS = Op.getOperand(3);
5682 SDValue Dest = Op.getOperand(4);
5683 SDLoc dl(Op);
5684
5685 if (isUnsupportedFloatingType(LHS.getValueType())) {
5686 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5687
5688 // If softenSetCCOperands only returned one value, we should compare it to
5689 // zero.
5690 if (!RHS.getNode()) {
5691 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5692 CC = ISD::SETNE;
5693 }
5694 }
5695
5696 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5697 // instruction.
5698 unsigned Opc = LHS.getOpcode();
5699 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5700 !Subtarget->isThumb1Only();
5701 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5702 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5703 Opc == ISD::USUBO || OptimizeMul) &&
5704 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5705 // Only lower legal XALUO ops.
5706 if (!isTypeLegal(LHS->getValueType(0)))
5707 return SDValue();
5708
5709 // The actual operation with overflow check.
5710 SDValue Value, OverflowCmp;
5711 SDValue ARMcc;
5712 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5713
5714 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5715 // Reverse the condition code.
5716 ARMCC::CondCodes CondCode =
5717 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5718 CondCode = ARMCC::getOppositeCondition(CondCode);
5719 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5720 }
5721
5722 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5723 OverflowCmp);
5724 }
5725
5726 if (LHS.getValueType() == MVT::i32) {
5727 SDValue ARMcc;
5728 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5729 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5730 }
5731
5732 SDNodeFlags Flags = Op->getFlags();
5733 if (Flags.hasNoNaNs() &&
5734 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5735 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5736 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5737 CC == ISD::SETUNE)) {
5738 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5739 return Result;
5740 }
5741
5742 ARMCC::CondCodes CondCode, CondCode2;
5743 FPCCToARMCC(CC, CondCode, CondCode2);
5744
5745 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5746 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5747 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5748 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5749 if (CondCode2 != ARMCC::AL) {
5750 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5751 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5752 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5753 }
5754 return Res;
5755}
5756
5757SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5758 SDValue Chain = Op.getOperand(0);
5759 SDValue Table = Op.getOperand(1);
5760 SDValue Index = Op.getOperand(2);
5761 SDLoc dl(Op);
5762
5763 EVT PTy = getPointerTy(DAG.getDataLayout());
5764 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5765 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5766 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5767 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5768 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5769 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5770 // Thumb2 and ARMv8-M use a two-level jump: the first branch jumps into the
5771 // jump table, which then branches to the destination. This also makes it
5772 // easier to translate into TBB / TBH later (Thumb2 only).
5773 // FIXME: This might not work if the function is extremely large.
5774 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5775 Addr, Op.getOperand(2), JTI);
5776 }
5777 if (isPositionIndependent() || Subtarget->isROPI()) {
5778 Addr =
5779 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5780 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5781 Chain = Addr.getValue(1);
5782 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5783 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5784 } else {
5785 Addr =
5786 DAG.getLoad(PTy, dl, Chain, Addr,
5787 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5788 Chain = Addr.getValue(1);
5789 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5790 }
5791}
5792
5793static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5794 EVT VT = Op.getValueType();
5795 SDLoc dl(Op);
5796
5797 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5798 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5799 return Op;
5800 return DAG.UnrollVectorOp(Op.getNode());
5801 }
5802
5803 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5804
5805 EVT NewTy;
5806 const EVT OpTy = Op.getOperand(0).getValueType();
5807 if (OpTy == MVT::v4f32)
5808 NewTy = MVT::v4i32;
5809 else if (OpTy == MVT::v4f16 && HasFullFP16)
5810 NewTy = MVT::v4i16;
5811 else if (OpTy == MVT::v8f16 && HasFullFP16)
5812 NewTy = MVT::v8i16;
5813 else
5814 llvm_unreachable("Invalid type for custom lowering!");
5815
5816 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5817 return DAG.UnrollVectorOp(Op.getNode());
5818
5819 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5820 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5821}
5822
5823SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5824 EVT VT = Op.getValueType();
5825 if (VT.isVector())
5826 return LowerVectorFP_TO_INT(Op, DAG);
5827
5828 bool IsStrict = Op->isStrictFPOpcode();
5829 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5830
5831 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5832 RTLIB::Libcall LC;
5833 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5834 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5835 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5836 Op.getValueType());
5837 else
5838 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5839 Op.getValueType());
5840 SDLoc Loc(Op);
5841 MakeLibCallOptions CallOptions;
5842 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5843 SDValue Result;
5844 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5845 CallOptions, Loc, Chain);
5846 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5847 }
5848
5849 // FIXME: Remove this when we have strict fp instruction selection patterns
5850 if (IsStrict) {
5851 SDLoc Loc(Op);
5852 SDValue Result =
5853 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5854 : ISD::FP_TO_UINT,
5855 Loc, Op.getValueType(), SrcVal);
5856 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5857 }
5858
5859 return Op;
5860}
5861
5862static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5863 const ARMSubtarget *Subtarget) {
5864 EVT VT = Op.getValueType();
5865 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5866 EVT FromVT = Op.getOperand(0).getValueType();
5867
5868 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5869 return Op;
5870 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5871 Subtarget->hasFP64())
5872 return Op;
5873 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5874 Subtarget->hasFullFP16())
5875 return Op;
5876 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5877 Subtarget->hasMVEFloatOps())
5878 return Op;
5879 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5880 Subtarget->hasMVEFloatOps())
5881 return Op;
5882
5883 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5884 return SDValue();
5885
5886 SDLoc DL(Op);
5887 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5888 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5889 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5890 DAG.getValueType(VT.getScalarType()));
5891 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5892 DAG.getConstant((1 << BW) - 1, DL, VT));
5893 if (IsSigned)
5894 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5895 DAG.getSignedConstant(-(1 << BW), DL, VT));
5896 return Max;
5897}
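// Informal example: an FP_TO_SINT_SAT with an i8 saturation width but i32
// result lanes first converts saturating to the full 32-bit lane (the inner
// node above), then clamps with SMIN(..., 127) and SMAX(..., -128), since
// BW == 7; this reproduces the i8 saturating-conversion semantics.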
5898
5899static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5900 EVT VT = Op.getValueType();
5901 SDLoc dl(Op);
5902
5903 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5904 if (VT.getVectorElementType() == MVT::f32)
5905 return Op;
5906 return DAG.UnrollVectorOp(Op.getNode());
5907 }
5908
5909 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5910 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5911 "Invalid type for custom lowering!");
5912
5913 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5914
5915 EVT DestVecType;
5916 if (VT == MVT::v4f32)
5917 DestVecType = MVT::v4i32;
5918 else if (VT == MVT::v4f16 && HasFullFP16)
5919 DestVecType = MVT::v4i16;
5920 else if (VT == MVT::v8f16 && HasFullFP16)
5921 DestVecType = MVT::v8i16;
5922 else
5923 return DAG.UnrollVectorOp(Op.getNode());
5924
5925 unsigned CastOpc;
5926 unsigned Opc;
5927 switch (Op.getOpcode()) {
5928 default: llvm_unreachable("Invalid opcode!");
5929 case ISD::SINT_TO_FP:
5930 CastOpc = ISD::SIGN_EXTEND;
5931 Opc = ISD::SINT_TO_FP;
5932 break;
5933 case ISD::UINT_TO_FP:
5934 CastOpc = ISD::ZERO_EXTEND;
5935 Opc = ISD::UINT_TO_FP;
5936 break;
5937 }
5938
5939 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5940 return DAG.getNode(Opc, dl, VT, Op);
5941}
5942
5943SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5944 EVT VT = Op.getValueType();
5945 if (VT.isVector())
5946 return LowerVectorINT_TO_FP(Op, DAG);
5947 if (isUnsupportedFloatingType(VT)) {
5948 RTLIB::Libcall LC;
5949 if (Op.getOpcode() == ISD::SINT_TO_FP)
5950 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5951 Op.getValueType());
5952 else
5953 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5954 Op.getValueType());
5955 MakeLibCallOptions CallOptions;
5956 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5957 CallOptions, SDLoc(Op)).first;
5958 }
5959
5960 return Op;
5961}
5962
5963SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5964 // Implement fcopysign by copying the sign bit (NEON bit-select or integer masking).
5965 SDValue Tmp0 = Op.getOperand(0);
5966 SDValue Tmp1 = Op.getOperand(1);
5967 SDLoc dl(Op);
5968 EVT VT = Op.getValueType();
5969 EVT SrcVT = Tmp1.getValueType();
5970 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5971 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5972 bool UseNEON = !InGPR && Subtarget->hasNEON();
5973
5974 if (UseNEON) {
5975 // Use VBSL to copy the sign bit.
5976 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5977 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5978 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5979 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5980 if (VT == MVT::f64)
5981 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5982 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5983 DAG.getConstant(32, dl, MVT::i32));
5984 else /*if (VT == MVT::f32)*/
5985 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5986 if (SrcVT == MVT::f32) {
5987 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5988 if (VT == MVT::f64)
5989 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5990 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5991 DAG.getConstant(32, dl, MVT::i32));
5992 } else if (VT == MVT::f32)
5993 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5994 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5995 DAG.getConstant(32, dl, MVT::i32));
5996 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5997 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5998
5999 SDValue AllOnes = DAG.getConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
6000 dl, MVT::i32);
6001 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6002 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6003 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6004
6005 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6006 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6007 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6008 if (VT == MVT::f32) {
6009 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6010 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6011 DAG.getConstant(0, dl, MVT::i32));
6012 } else {
6013 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6014 }
6015
6016 return Res;
6017 }
6018
6019 // Bitcast operand 1 to i32.
6020 if (SrcVT == MVT::f64)
6021 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6022 Tmp1).getValue(1);
6023 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6024
6025 // Or in the signbit with integer operations.
6026 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6027 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6028 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6029 if (VT == MVT::f32) {
6030 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6031 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6032 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6033 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6034 }
6035
6036 // f64: Or the high part with signbit and then combine two parts.
6037 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6038 Tmp0);
6039 SDValue Lo = Tmp0.getValue(0);
6040 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6041 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6042 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6043}
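// Informal sketch of the GPR path above: for f32 the result bits are
// (bits(x) & 0x7fffffff) | (bits(y) & 0x80000000); the f64 path applies the
// same masking to the high word only and rebuilds the double with VMOVDRR.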
6044
6045SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6046 MachineFunction &MF = DAG.getMachineFunction();
6047 MachineFrameInfo &MFI = MF.getFrameInfo();
6048 MFI.setReturnAddressIsTaken(true);
6049
6050 EVT VT = Op.getValueType();
6051 SDLoc dl(Op);
6052 unsigned Depth = Op.getConstantOperandVal(0);
6053 if (Depth) {
6054 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6055 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6056 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6057 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6058 MachinePointerInfo());
6059 }
6060
6061 // Return LR, which contains the return address. Mark it an implicit live-in.
6062 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6063 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6064}
6065
6066SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6067 const ARMBaseRegisterInfo &ARI =
6068 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6069 MachineFunction &MF = DAG.getMachineFunction();
6070 MachineFrameInfo &MFI = MF.getFrameInfo();
6071 MFI.setFrameAddressIsTaken(true);
6072
6073 EVT VT = Op.getValueType();
6074 SDLoc dl(Op); // FIXME probably not meaningful
6075 unsigned Depth = Op.getConstantOperandVal(0);
6076 Register FrameReg = ARI.getFrameRegister(MF);
6077 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6078 while (Depth--)
6079 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6080 MachinePointerInfo());
6081 return FrameAddr;
6082}
6083
6084// FIXME? Maybe this could be a TableGen attribute on some registers and
6085// this table could be generated automatically from RegInfo.
6086Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6087 const MachineFunction &MF) const {
6088 return StringSwitch<Register>(RegName)
6089 .Case("sp", ARM::SP)
6090 .Default(Register());
6091}
6092
6093// Result is 64 bit value so split into two 32 bit values and return as a
6094// pair of values.
6095static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6096 SelectionDAG &DAG) {
6097 SDLoc DL(N);
6098
6099 // This function is only supposed to be called for i64 type destination.
6100 assert(N->getValueType(0) == MVT::i64
6101 && "ExpandREAD_REGISTER called for non-i64 type result.");
6102
6103 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6104 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6105 N->getOperand(0),
6106 N->getOperand(1));
6107
6108 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6109 Read.getValue(1)));
6110 Results.push_back(Read.getValue(2)); // Chain
6111}
6112
6113/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6114/// When \p DstVT, the destination type of \p BC, is on the vector
6115/// register bank and the source of bitcast, \p Op, operates on the same bank,
6116/// it might be possible to combine them, such that everything stays on the
6117/// vector register bank.
6118 /// \return The node that would replace \p BC, if the combine
6119/// is possible.
6120static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6121 SelectionDAG &DAG) {
6122 SDValue Op = BC->getOperand(0);
6123 EVT DstVT = BC->getValueType(0);
6124
6125 // The only vector instruction that can produce a scalar (remember,
6126 // since the bitcast was about to be turned into VMOVDRR, the source
6127 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6128 // Moreover, we can do this combine only if there is one use.
6129 // Finally, if the destination type is not a vector, there is not
6130 // much point in forcing everything onto the vector bank.
6131 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6132 !Op.hasOneUse())
6133 return SDValue();
6134
6135 // If the index is not constant, we will introduce an additional
6136 // multiply that will stick.
6137 // Give up in that case.
6138 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6139 if (!Index)
6140 return SDValue();
6141 unsigned DstNumElt = DstVT.getVectorNumElements();
6142
6143 // Compute the new index.
6144 const APInt &APIntIndex = Index->getAPIntValue();
6145 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6146 NewIndex *= APIntIndex;
6147 // Check if the new constant index fits into i32.
6148 if (NewIndex.getBitWidth() > 32)
6149 return SDValue();
6150
6151 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6152 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6153 SDLoc dl(Op);
6154 SDValue ExtractSrc = Op.getOperand(0);
6155 EVT VecVT = EVT::getVectorVT(
6156 *DAG.getContext(), DstVT.getScalarType(),
6157 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6158 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6159 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6160 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6161}
6162
6163/// ExpandBITCAST - If the target supports VFP, this function is called to
6164/// expand a bit convert where either the source or destination type is i64 to
6165/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6166/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6167/// vectors), since the legalizer won't know what to do with that.
6168SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6169 const ARMSubtarget *Subtarget) const {
6170 SDLoc dl(N);
6171 SDValue Op = N->getOperand(0);
6172
6173 // This function is only supposed to be called for i16 and i64 types, either
6174 // as the source or destination of the bit convert.
6175 EVT SrcVT = Op.getValueType();
6176 EVT DstVT = N->getValueType(0);
6177
6178 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6179 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6180 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6181 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6182
6183 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6184 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6185 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6186 Op = DAG.getBitcast(MVT::f16, Op);
6187 return DAG.getNode(
6188 ISD::TRUNCATE, SDLoc(N), DstVT,
6189 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6190 }
6191
6192 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6193 return SDValue();
6194
6195 // Turn i64->f64 into VMOVDRR.
6196 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6197 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6198 // if we can combine the bitcast with its source.
6199 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6200 return Val;
6201 SDValue Lo, Hi;
6202 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6203 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6204 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6205 }
6206
6207 // Turn f64->i64 into VMOVRRD.
6208 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6209 SDValue Cvt;
6210 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6211 SrcVT.getVectorNumElements() > 1)
6212 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6213 DAG.getVTList(MVT::i32, MVT::i32),
6214 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6215 else
6216 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6217 DAG.getVTList(MVT::i32, MVT::i32), Op);
6218 // Merge the pieces into a single i64 value.
6219 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6220 }
6221
6222 return SDValue();
6223}
6224
6225/// getZeroVector - Returns a vector of specified type with all zero elements.
6226/// Zero vectors are used to represent vector negation and in those cases
6227/// will be implemented with the NEON VNEG instruction. However, VNEG does
6228/// not support i64 elements, so sometimes the zero vectors will need to be
6229/// explicitly constructed. Regardless, use a canonical VMOV to create the
6230/// zero vector.
6231static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6232 assert(VT.isVector() && "Expected a vector type");
6233 // The canonical modified immediate encoding of a zero vector is....0!
6234 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6235 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6236 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6237 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6238}
6239
6240/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6241/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6242SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6243 SelectionDAG &DAG) const {
6244 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6245 EVT VT = Op.getValueType();
6246 unsigned VTBits = VT.getSizeInBits();
6247 SDLoc dl(Op);
6248 SDValue ShOpLo = Op.getOperand(0);
6249 SDValue ShOpHi = Op.getOperand(1);
6250 SDValue ShAmt = Op.getOperand(2);
6251 SDValue ARMcc;
6252 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6253
6254 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6255
6256 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6257 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6258 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6259 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6260 DAG.getConstant(VTBits, dl, MVT::i32));
6261 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6262 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6263 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6264 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6265 ISD::SETGE, ARMcc, DAG, dl);
6266 SDValue Lo =
6267 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6268
6269 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6270 SDValue HiBigShift = Opc == ISD::SRA
6271 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6272 DAG.getConstant(VTBits - 1, dl, VT))
6273 : DAG.getConstant(0, dl, VT);
6274 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6275 ISD::SETGE, ARMcc, DAG, dl);
6276 SDValue Hi =
6277 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6278
6279 SDValue Ops[2] = { Lo, Hi };
6280 return DAG.getMergeValues(Ops, dl);
6281}
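// Informal illustration: for a 64-bit SRL_PARTS with ShAmt == 40, ExtraShAmt is
// 8 (non-negative), so the CMOVs pick the "big shift" operands: Lo becomes
// ShOpHi >> 8 and Hi becomes 0 (for SRA_PARTS, Hi would be ShOpHi >> 31).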
6282
6283/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6284/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6285SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6286 SelectionDAG &DAG) const {
6287 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6288 EVT VT = Op.getValueType();
6289 unsigned VTBits = VT.getSizeInBits();
6290 SDLoc dl(Op);
6291 SDValue ShOpLo = Op.getOperand(0);
6292 SDValue ShOpHi = Op.getOperand(1);
6293 SDValue ShAmt = Op.getOperand(2);
6294 SDValue ARMcc;
6295
6296 assert(Op.getOpcode() == ISD::SHL_PARTS);
6297 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6298 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6299 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6300 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6301 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6302
6303 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6304 DAG.getConstant(VTBits, dl, MVT::i32));
6305 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6306 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6307 ISD::SETGE, ARMcc, DAG, dl);
6308 SDValue Hi =
6309 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6310
6311 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6312 ISD::SETGE, ARMcc, DAG, dl);
6313 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6314 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6315 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6316
6317 SDValue Ops[2] = { Lo, Hi };
6318 return DAG.getMergeValues(Ops, dl);
6319}
6320
6321SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6322 SelectionDAG &DAG) const {
6323 // The rounding mode is in bits 23:22 of the FPSCR.
6324 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6325 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
6326 // so that the shift + and get folded into a bitfield extract.
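// For example (informal): FPSCR[23:22] == 0b01 (round towards plus infinity)
// gives ((1 << 22) + (1 << 22)) >> 22 == 2, the FLT_ROUNDS value for upward
// rounding, matching the 1 -> 2 entry of the mapping above.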
6327 SDLoc dl(Op);
6328 SDValue Chain = Op.getOperand(0);
6329 SDValue Ops[] = {Chain,
6330 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6331
6332 SDValue FPSCR =
6333 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6334 Chain = FPSCR.getValue(1);
6335 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6336 DAG.getConstant(1U << 22, dl, MVT::i32));
6337 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6338 DAG.getConstant(22, dl, MVT::i32));
6339 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6340 DAG.getConstant(3, dl, MVT::i32));
6341 return DAG.getMergeValues({And, Chain}, dl);
6342}
6343
6344SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6345 SelectionDAG &DAG) const {
6346 SDLoc DL(Op);
6347 SDValue Chain = Op->getOperand(0);
6348 SDValue RMValue = Op->getOperand(1);
6349
6350 // The rounding mode is in bits 23:22 of the FPSCR.
6351 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6352 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6353 // ((arg - 1) & 3) << 22.
6354 //
6355 // It is expected that the argument of llvm.set.rounding is within the
6356 // interval [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6357 // responsibility of the code that generated the llvm.set.rounding call to
6358 // ensure this condition.
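// For example (informal): llvm.set.rounding(2) (round upward) maps to
// ((2 - 1) & 3) == 1, the FPSCR RP encoding, which is then shifted into
// bits 23:22.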
6359
6360 // Calculate new value of FPSCR[23:22].
6361 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6362 DAG.getConstant(1, DL, MVT::i32));
6363 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6364 DAG.getConstant(0x3, DL, MVT::i32));
6365 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6366 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6367
6368 // Get current value of FPSCR.
6369 SDValue Ops[] = {Chain,
6370 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6371 SDValue FPSCR =
6372 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6373 Chain = FPSCR.getValue(1);
6374 FPSCR = FPSCR.getValue(0);
6375
6376 // Put new rounding mode into FPSCR[23:22].
6377 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6378 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6379 DAG.getConstant(RMMask, DL, MVT::i32));
6380 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6381 SDValue Ops2[] = {
6382 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6383 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6384}
6385
6386SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6387 SelectionDAG &DAG) const {
6388 SDLoc DL(Op);
6389 SDValue Chain = Op->getOperand(0);
6390 SDValue Mode = Op->getOperand(1);
6391
6392 // Generate nodes to build:
6393 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6394 SDValue Ops[] = {Chain,
6395 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6396 SDValue FPSCR =
6397 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6398 Chain = FPSCR.getValue(1);
6399 FPSCR = FPSCR.getValue(0);
6400
6401 SDValue FPSCRMasked =
6402 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6403 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6404 SDValue InputMasked =
6405 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6406 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6407 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6408
6409 SDValue Ops2[] = {
6410 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6411 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6412}
6413
6414SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6415 SelectionDAG &DAG) const {
6416 SDLoc DL(Op);
6417 SDValue Chain = Op->getOperand(0);
6418
6419 // To get the default FP mode all control bits are cleared:
6420 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6421 SDValue Ops[] = {Chain,
6422 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6423 SDValue FPSCR =
6424 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6425 Chain = FPSCR.getValue(1);
6426 FPSCR = FPSCR.getValue(0);
6427
6428 SDValue FPSCRMasked = DAG.getNode(
6429 ISD::AND, DL, MVT::i32, FPSCR,
6430 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6431 SDValue Ops2[] = {Chain,
6432 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6433 FPSCRMasked};
6434 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6435}
6436
6437static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6438 const ARMSubtarget *ST) {
6439 SDLoc dl(N);
6440 EVT VT = N->getValueType(0);
6441 if (VT.isVector() && ST->hasNEON()) {
6442
6443 // Compute the least significant set bit: LSB = X & -X
6444 SDValue X = N->getOperand(0);
6445 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6446 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6447
6448 EVT ElemTy = VT.getVectorElementType();
6449
6450 if (ElemTy == MVT::i8) {
6451 // Compute with: cttz(x) = ctpop(lsb - 1)
6452 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6453 DAG.getTargetConstant(1, dl, ElemTy));
6454 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6455 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6456 }
6457
6458 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6459 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6460 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6461 unsigned NumBits = ElemTy.getSizeInBits();
6462 SDValue WidthMinus1 =
6463 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6464 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6465 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6466 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6467 }
6468
6469 // Compute with: cttz(x) = ctpop(lsb - 1)
6470
6471 // Compute LSB - 1.
6472 SDValue Bits;
6473 if (ElemTy == MVT::i64) {
6474 // Load constant 0xffff'ffff'ffff'ffff to register.
6475 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6476 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6477 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6478 } else {
6479 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6480 DAG.getTargetConstant(1, dl, ElemTy));
6481 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6482 }
6483 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6484 }
6485
6486 if (!ST->hasV6T2Ops())
6487 return SDValue();
6488
6489 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6490 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6491}
6492
6493static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6494 const ARMSubtarget *ST) {
6495 EVT VT = N->getValueType(0);
6496 SDLoc DL(N);
6497
6498 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6499 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6500 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6501 "Unexpected type for custom ctpop lowering");
6502
6503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6504 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6505 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6506 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6507
6508 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
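 // For example, a v4i32 popcount is computed as a v16i8 VCNT followed by
 // VPADDL.U8 (to v8i16) and then VPADDL.U16 (to v4i32).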
6509 unsigned EltSize = 8;
6510 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6511 while (EltSize != VT.getScalarSizeInBits()) {
6512 SmallVector<SDValue, 8> Ops;
6513 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6514 TLI.getPointerTy(DAG.getDataLayout())));
6515 Ops.push_back(Res);
6516
6517 EltSize *= 2;
6518 NumElts /= 2;
6519 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6520 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6521 }
6522
6523 return Res;
6524}
6525
6526/// getVShiftImm - Check if this is a valid build_vector for the immediate
6527/// operand of a vector shift operation, where all the elements of the
6528/// build_vector must have the same constant integer value.
6529static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6530 // Ignore bit_converts.
6531 while (Op.getOpcode() == ISD::BITCAST)
6532 Op = Op.getOperand(0);
6533 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6534 APInt SplatBits, SplatUndef;
6535 unsigned SplatBitSize;
6536 bool HasAnyUndefs;
6537 if (!BVN ||
6538 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6539 ElementBits) ||
6540 SplatBitSize > ElementBits)
6541 return false;
6542 Cnt = SplatBits.getSExtValue();
6543 return true;
6544}
6545
6546/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6547/// operand of a vector shift left operation. That value must be in the range:
6548/// 0 <= Value < ElementBits for a left shift; or
6549/// 0 <= Value <= ElementBits for a long left shift.
6550static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6551 assert(VT.isVector() && "vector shift count is not a vector type");
6552 int64_t ElementBits = VT.getScalarSizeInBits();
6553 if (!getVShiftImm(Op, ElementBits, Cnt))
6554 return false;
6555 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6556}
6557
6558/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6559/// operand of a vector shift right operation. For a shift opcode, the value
6560/// is positive, but for an intrinsic the value count must be negative. The
6561/// absolute value must be in the range:
6562/// 1 <= |Value| <= ElementBits for a right shift; or
6563/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6564static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6565 int64_t &Cnt) {
6566 assert(VT.isVector() && "vector shift count is not a vector type");
6567 int64_t ElementBits = VT.getScalarSizeInBits();
6568 if (!getVShiftImm(Op, ElementBits, Cnt))
6569 return false;
6570 if (!isIntrinsic)
6571 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6572 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6573 Cnt = -Cnt;
6574 return true;
6575 }
6576 return false;
6577}
6578
6579static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6580 const ARMSubtarget *ST) {
6581 EVT VT = N->getValueType(0);
6582 SDLoc dl(N);
6583 int64_t Cnt;
6584
6585 if (!VT.isVector())
6586 return SDValue();
6587
6588 // We essentially have two forms here. Shift by an immediate and shift by a
6589 // vector register (there is also a shift by a GPR, but that is just handled
6590 // with a tablegen pattern). We cannot easily match shift by an immediate in
6591 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6592 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6593 // signed or unsigned, and a negative shift indicates a shift right).
6594 if (N->getOpcode() == ISD::SHL) {
6595 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6596 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6597 DAG.getConstant(Cnt, dl, MVT::i32));
6598 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6599 N->getOperand(1));
6600 }
6601
6602 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6603 "unexpected vector shift opcode");
6604
6605 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6606 unsigned VShiftOpc =
6607 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6608 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6609 DAG.getConstant(Cnt, dl, MVT::i32));
6610 }
6611
6612 // Other right shifts we don't have operations for (we use a shift left by a
6613 // negative number).
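 // For example, 'srl <4 x i32> X, Y' with a non-constant Y becomes
 // VSHLu X, (zero - Y).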
6614 EVT ShiftVT = N->getOperand(1).getValueType();
6615 SDValue NegatedCount = DAG.getNode(
6616 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6617 unsigned VShiftOpc =
6618 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6619 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6620}
6621
6622static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6623 const ARMSubtarget *ST) {
6624 EVT VT = N->getValueType(0);
6625 SDLoc dl(N);
6626
6627 // We can get here for a node like i32 = ISD::SHL i32, i64
6628 if (VT != MVT::i64)
6629 return SDValue();
6630
6631 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6632 N->getOpcode() == ISD::SHL) &&
6633 "Unknown shift to lower!");
6634
6635 unsigned ShOpc = N->getOpcode();
6636 if (ST->hasMVEIntegerOps()) {
6637 SDValue ShAmt = N->getOperand(1);
6638 unsigned ShPartsOpc = ARMISD::LSLL;
6639 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6640
6641 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6642 // then do the default optimisation
6643 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6644 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6645 return SDValue();
6646
6647 // Extract the lower 32 bits of the shift amount if it's not an i32
6648 if (ShAmt->getValueType(0) != MVT::i32)
6649 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6650
6651 if (ShOpc == ISD::SRL) {
6652 if (!Con)
6653 // There is no t2LSRLr instruction so negate and perform an lsll if the
6654 // shift amount is in a register, emulating a right shift.
6655 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6656 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6657 else
6658 // Else generate an lsrl on the immediate shift amount
6659 ShPartsOpc = ARMISD::LSRL;
6660 } else if (ShOpc == ISD::SRA)
6661 ShPartsOpc = ARMISD::ASRL;
6662
6663 // Split Lower/Upper 32 bits of the destination/source
6664 SDValue Lo, Hi;
6665 std::tie(Lo, Hi) =
6666 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6667 // Generate the shift operation as computed above
6668 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6669 ShAmt);
6670 // The upper 32 bits come from the second return value of lsll
6671 Hi = SDValue(Lo.getNode(), 1);
6672 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6673 }
6674
6675 // We only lower SRA and SRL of 1 here; all others use generic lowering.
6676 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6677 return SDValue();
6678
6679 // If we are in thumb mode, we don't have RRX.
6680 if (ST->isThumb1Only())
6681 return SDValue();
6682
6683 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6684 SDValue Lo, Hi;
6685 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6686
6687 // First, build an LSRS1/ASRS1 op, which shifts the top part by one and
6688 // captures the shifted out bit into a carry flag.
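 // For example, for 'srl i64 X, 1' with X = Hi:Lo, the new Hi is LSRS1(Hi)
 // (bit 0 of Hi lands in the carry flag) and the new Lo is RRX(Lo), which
 // shifts that carry into bit 31.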
6689 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6690 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6691
6692 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6693 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6694
6695 // Merge the pieces into a single i64 value.
6696 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6697}
6698
6699static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6700 const ARMSubtarget *ST) {
6701 bool Invert = false;
6702 bool Swap = false;
6703 unsigned Opc = ARMCC::AL;
6704
6705 SDValue Op0 = Op.getOperand(0);
6706 SDValue Op1 = Op.getOperand(1);
6707 SDValue CC = Op.getOperand(2);
6708 EVT VT = Op.getValueType();
6709 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6710 SDLoc dl(Op);
6711
6712 EVT CmpVT;
6713 if (ST->hasNEON())
6714 CmpVT = VT.changeVectorElementTypeToInteger();
6715 else {
6716 assert(ST->hasMVEIntegerOps() &&
6717 "No hardware support for integer vector comparison!");
6718
6719 if (Op.getValueType().getVectorElementType() != MVT::i1)
6720 return SDValue();
6721
6722 // Make sure we expand floating point setcc to scalar if we do not have
6723 // mve.fp, so that we can handle them from there.
6724 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6725 return SDValue();
6726
6727 CmpVT = VT;
6728 }
6729
6730 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6731 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6732 // Special-case integer 64-bit equality comparisons. They aren't legal,
6733 // but they can be lowered with a few vector instructions.
6734 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6735 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6736 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6737 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6738 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6739 DAG.getCondCode(ISD::SETEQ));
6740 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6741 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6742 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6743 if (SetCCOpcode == ISD::SETNE)
6744 Merged = DAG.getNOT(dl, Merged, CmpVT);
6745 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6746 return Merged;
6747 }
6748
6749 if (CmpVT.getVectorElementType() == MVT::i64)
6750 // 64-bit comparisons are not legal in general.
6751 return SDValue();
6752
6753 if (Op1.getValueType().isFloatingPoint()) {
6754 switch (SetCCOpcode) {
6755 default: llvm_unreachable("Illegal FP comparison");
6756 case ISD::SETUNE:
6757 case ISD::SETNE:
6758 if (ST->hasMVEFloatOps()) {
6759 Opc = ARMCC::NE; break;
6760 } else {
6761 Invert = true; [[fallthrough]];
6762 }
6763 case ISD::SETOEQ:
6764 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6765 case ISD::SETOLT:
6766 case ISD::SETLT: Swap = true; [[fallthrough]];
6767 case ISD::SETOGT:
6768 case ISD::SETGT: Opc = ARMCC::GT; break;
6769 case ISD::SETOLE:
6770 case ISD::SETLE: Swap = true; [[fallthrough]];
6771 case ISD::SETOGE:
6772 case ISD::SETGE: Opc = ARMCC::GE; break;
6773 case ISD::SETUGE: Swap = true; [[fallthrough]];
6774 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6775 case ISD::SETUGT: Swap = true; [[fallthrough]];
6776 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6777 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6778 case ISD::SETONE: {
6779 // Expand this to (OLT | OGT).
6780 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6781 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6782 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6783 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6784 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6785 if (Invert)
6786 Result = DAG.getNOT(dl, Result, VT);
6787 return Result;
6788 }
6789 case ISD::SETUO: Invert = true; [[fallthrough]];
6790 case ISD::SETO: {
6791 // Expand this to (OLT | OGE).
6792 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6793 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6794 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6795 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6796 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6797 if (Invert)
6798 Result = DAG.getNOT(dl, Result, VT);
6799 return Result;
6800 }
6801 }
6802 } else {
6803 // Integer comparisons.
6804 switch (SetCCOpcode) {
6805 default: llvm_unreachable("Illegal integer comparison");
6806 case ISD::SETNE:
6807 if (ST->hasMVEIntegerOps()) {
6808 Opc = ARMCC::NE; break;
6809 } else {
6810 Invert = true; [[fallthrough]];
6811 }
6812 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6813 case ISD::SETLT: Swap = true; [[fallthrough]];
6814 case ISD::SETGT: Opc = ARMCC::GT; break;
6815 case ISD::SETLE: Swap = true; [[fallthrough]];
6816 case ISD::SETGE: Opc = ARMCC::GE; break;
6817 case ISD::SETULT: Swap = true; [[fallthrough]];
6818 case ISD::SETUGT: Opc = ARMCC::HI; break;
6819 case ISD::SETULE: Swap = true; [[fallthrough]];
6820 case ISD::SETUGE: Opc = ARMCC::HS; break;
6821 }
6822
6823 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
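 // VTST sets a lane to all-ones when (op0 & op1) is non-zero, so it directly
 // implements the NE case (reached here with Invert set); the plain EQ case
 // is handled by negating the VTST result below.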
6824 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6825 SDValue AndOp;
6826 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6827 AndOp = Op0;
6828 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6829 AndOp = Op1;
6830
6831 // Ignore bitconvert.
6832 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6833 AndOp = AndOp.getOperand(0);
6834
6835 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6836 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6837 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6838 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6839 if (!Invert)
6840 Result = DAG.getNOT(dl, Result, VT);
6841 return Result;
6842 }
6843 }
6844 }
6845
6846 if (Swap)
6847 std::swap(Op0, Op1);
6848
6849 // If one of the operands is a constant vector zero, attempt to fold the
6850 // comparison to a specialized compare-against-zero form.
6851 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6852 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6853 Opc == ARMCC::NE)) {
6854 if (Opc == ARMCC::GE)
6855 Opc = ARMCC::LE;
6856 else if (Opc == ARMCC::GT)
6857 Opc = ARMCC::LT;
6858 std::swap(Op0, Op1);
6859 }
6860
6861 SDValue Result;
6862 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6863 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6864 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6865 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6866 DAG.getConstant(Opc, dl, MVT::i32));
6867 else
6868 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6869 DAG.getConstant(Opc, dl, MVT::i32));
6870
6871 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6872
6873 if (Invert)
6874 Result = DAG.getNOT(dl, Result, VT);
6875
6876 return Result;
6877}
6878
6879static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6880 SDValue LHS = Op.getOperand(0);
6881 SDValue RHS = Op.getOperand(1);
6882 SDValue Carry = Op.getOperand(2);
6883 SDValue Cond = Op.getOperand(3);
6884 SDLoc DL(Op);
6885
6886 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6887
6888 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6889 // have to invert the carry first.
6890 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6891 DAG.getConstant(1, DL, MVT::i32), Carry);
6892 // This converts the boolean value carry into the carry flag.
6893 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6894
6895 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6896 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6897
6898 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6899 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6900 SDValue ARMcc = DAG.getConstant(
6901 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6902 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6903 Cmp.getValue(1));
6904}
6905
6906/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6907/// valid vector constant for a NEON or MVE instruction with a "modified
6908/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6909static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6910 unsigned SplatBitSize, SelectionDAG &DAG,
6911 const SDLoc &dl, EVT &VT, EVT VectorVT,
6912 VMOVModImmType type) {
6913 unsigned OpCmode, Imm;
6914 bool is128Bits = VectorVT.is128BitVector();
6915
6916 // SplatBitSize is set to the smallest size that splats the vector, so a
6917 // zero vector will always have SplatBitSize == 8. However, NEON modified
6918 // immediate instructions other than VMOV do not support the 8-bit encoding
6919 // of a zero vector, and the default encoding of zero is supposed to be the
6920 // 32-bit version.
6921 if (SplatBits == 0)
6922 SplatBitSize = 32;
6923
6924 switch (SplatBitSize) {
6925 case 8:
6926 if (type != VMOVModImm)
6927 return SDValue();
6928 // Any 1-byte value is OK. Op=0, Cmode=1110.
6929 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6930 OpCmode = 0xe;
6931 Imm = SplatBits;
6932 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6933 break;
6934
6935 case 16:
6936 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6937 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6938 if ((SplatBits & ~0xff) == 0) {
6939 // Value = 0x00nn: Op=x, Cmode=100x.
6940 OpCmode = 0x8;
6941 Imm = SplatBits;
6942 break;
6943 }
6944 if ((SplatBits & ~0xff00) == 0) {
6945 // Value = 0xnn00: Op=x, Cmode=101x.
6946 OpCmode = 0xa;
6947 Imm = SplatBits >> 8;
6948 break;
6949 }
6950 return SDValue();
6951
6952 case 32:
6953 // NEON's 32-bit VMOV supports splat values where:
6954 // * only one byte is nonzero, or
6955 // * the least significant byte is 0xff and the second byte is nonzero, or
6956 // * the least significant 2 bytes are 0xff and the third is nonzero.
6957 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6958 if ((SplatBits & ~0xff) == 0) {
6959 // Value = 0x000000nn: Op=x, Cmode=000x.
6960 OpCmode = 0;
6961 Imm = SplatBits;
6962 break;
6963 }
6964 if ((SplatBits & ~0xff00) == 0) {
6965 // Value = 0x0000nn00: Op=x, Cmode=001x.
6966 OpCmode = 0x2;
6967 Imm = SplatBits >> 8;
6968 break;
6969 }
6970 if ((SplatBits & ~0xff0000) == 0) {
6971 // Value = 0x00nn0000: Op=x, Cmode=010x.
6972 OpCmode = 0x4;
6973 Imm = SplatBits >> 16;
6974 break;
6975 }
6976 if ((SplatBits & ~0xff000000) == 0) {
6977 // Value = 0xnn000000: Op=x, Cmode=011x.
6978 OpCmode = 0x6;
6979 Imm = SplatBits >> 24;
6980 break;
6981 }
6982
6983 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6984 if (type == OtherModImm) return SDValue();
6985
6986 if ((SplatBits & ~0xffff) == 0 &&
6987 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6988 // Value = 0x0000nnff: Op=x, Cmode=1100.
6989 OpCmode = 0xc;
6990 Imm = SplatBits >> 8;
6991 break;
6992 }
6993
6994 // cmode == 0b1101 is not supported for MVE VMVN
6995 if (type == MVEVMVNModImm)
6996 return SDValue();
6997
6998 if ((SplatBits & ~0xffffff) == 0 &&
6999 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
7000 // Value = 0x00nnffff: Op=x, Cmode=1101.
7001 OpCmode = 0xd;
7002 Imm = SplatBits >> 16;
7003 break;
7004 }
7005
7006 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7007 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7008 // VMOV.I32. A (very) minor optimization would be to replicate the value
7009 // and fall through here to test for a valid 64-bit splat. But, then the
7010 // caller would also need to check and handle the change in size.
7011 return SDValue();
7012
7013 case 64: {
7014 if (type != VMOVModImm)
7015 return SDValue();
7016 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
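 // For example, the splat 0x00ff'00ff'00ff'00ff is encoded with
 // Imm = 0b01010101 (one bit per byte).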
7017 uint64_t BitMask = 0xff;
7018 unsigned ImmMask = 1;
7019 Imm = 0;
7020 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7021 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7022 Imm |= ImmMask;
7023 } else if ((SplatBits & BitMask) != 0) {
7024 return SDValue();
7025 }
7026 BitMask <<= 8;
7027 ImmMask <<= 1;
7028 }
7029
7030 // Op=1, Cmode=1110.
7031 OpCmode = 0x1e;
7032 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7033 break;
7034 }
7035
7036 default:
7037 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7038 }
7039
7040 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7041 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7042}
7043
7044SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7045 const ARMSubtarget *ST) const {
7046 EVT VT = Op.getValueType();
7047 bool IsDouble = (VT == MVT::f64);
7048 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7049 const APFloat &FPVal = CFP->getValueAPF();
7050
7051 // Prevent floating-point constants from using literal loads
7052 // when execute-only is enabled.
7053 if (ST->genExecuteOnly()) {
7054 // We shouldn't trigger this for v6m execute-only
7055 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7056 "Unexpected architecture");
7057
7058 // If we can represent the constant as an immediate, don't lower it
7059 if (isFPImmLegal(FPVal, VT))
7060 return Op;
7061 // Otherwise, construct as integer, and move to float register
7062 APInt INTVal = FPVal.bitcastToAPInt();
7063 SDLoc DL(CFP);
7064 switch (VT.getSimpleVT().SimpleTy) {
7065 default:
7066 llvm_unreachable("Unknown floating point type!");
7067 break;
7068 case MVT::f64: {
7069 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7070 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7071 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7072 }
7073 case MVT::f32:
7074 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7075 DAG.getConstant(INTVal, DL, MVT::i32));
7076 }
7077 }
7078
7079 if (!ST->hasVFP3Base())
7080 return SDValue();
7081
7082 // Use the default (constant pool) lowering for double constants when we have
7083 // an SP-only FPU
7084 if (IsDouble && !Subtarget->hasFP64())
7085 return SDValue();
7086
7087 // Try splatting with a VMOV.f32...
7088 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7089
7090 if (ImmVal != -1) {
7091 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7092 // We have code in place to select a valid ConstantFP already, no need to
7093 // do any mangling.
7094 return Op;
7095 }
7096
7097 // It's a float and we are trying to use NEON operations where
7098 // possible. Lower it to a splat followed by an extract.
7099 SDLoc DL(Op);
7100 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7101 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7102 NewVal);
7103 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7104 DAG.getConstant(0, DL, MVT::i32));
7105 }
7106
7107 // The rest of our options are NEON only, make sure that's allowed before
7108 // proceeding..
7109 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7110 return SDValue();
7111
7112 EVT VMovVT;
7113 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7114
7115 // It wouldn't really be worth bothering for doubles except for one very
7116 // important value, which does happen to match: 0.0. So make sure we don't do
7117 // anything stupid.
7118 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7119 return SDValue();
7120
7121 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7122 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7123 VMovVT, VT, VMOVModImm);
7124 if (NewVal != SDValue()) {
7125 SDLoc DL(Op);
7126 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7127 NewVal);
7128 if (IsDouble)
7129 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7130
7131 // It's a float: cast and extract a vector element.
7132 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7133 VecConstant);
7134 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7135 DAG.getConstant(0, DL, MVT::i32));
7136 }
7137
7138 // Finally, try a VMVN.i32
7139 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7140 VT, VMVNModImm);
7141 if (NewVal != SDValue()) {
7142 SDLoc DL(Op);
7143 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7144
7145 if (IsDouble)
7146 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7147
7148 // It's a float: cast and extract a vector element.
7149 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7150 VecConstant);
7151 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7152 DAG.getConstant(0, DL, MVT::i32));
7153 }
7154
7155 return SDValue();
7156}
7157
7158// Check if a VEXT instruction can handle the shuffle mask when the
7159// vector sources of the shuffle are the same.
7160static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7161 unsigned NumElts = VT.getVectorNumElements();
7162
7163 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7164 if (M[0] < 0)
7165 return false;
7166
7167 Imm = M[0];
7168
7169 // If this is a VEXT shuffle, the immediate value is the index of the first
7170 // element. The other shuffle indices must be the successive elements after
7171 // the first one.
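 // For example, for v8i8 the single-source mask <2, 3, 4, 5, 6, 7, 0, 1> is a
 // VEXT with Imm = 2 (the indices wrap back around to 0).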
7172 unsigned ExpectedElt = Imm;
7173 for (unsigned i = 1; i < NumElts; ++i) {
7174 // Increment the expected index. If it wraps around, just follow it
7175 // back to index zero and keep going.
7176 ++ExpectedElt;
7177 if (ExpectedElt == NumElts)
7178 ExpectedElt = 0;
7179
7180 if (M[i] < 0) continue; // ignore UNDEF indices
7181 if (ExpectedElt != static_cast<unsigned>(M[i]))
7182 return false;
7183 }
7184
7185 return true;
7186}
7187
7188static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7189 bool &ReverseVEXT, unsigned &Imm) {
7190 unsigned NumElts = VT.getVectorNumElements();
7191 ReverseVEXT = false;
7192
7193 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7194 if (M[0] < 0)
7195 return false;
7196
7197 Imm = M[0];
7198
7199 // If this is a VEXT shuffle, the immediate value is the index of the first
7200 // element. The other shuffle indices must be the successive elements after
7201 // the first one.
7202 unsigned ExpectedElt = Imm;
7203 for (unsigned i = 1; i < NumElts; ++i) {
7204 // Increment the expected index. If it wraps around, it may still be
7205 // a VEXT but the source vectors must be swapped.
7206 ExpectedElt += 1;
7207 if (ExpectedElt == NumElts * 2) {
7208 ExpectedElt = 0;
7209 ReverseVEXT = true;
7210 }
7211
7212 if (M[i] < 0) continue; // ignore UNDEF indices
7213 if (ExpectedElt != static_cast<unsigned>(M[i]))
7214 return false;
7215 }
7216
7217 // Adjust the index value if the source operands will be swapped.
7218 if (ReverseVEXT)
7219 Imm -= NumElts;
7220
7221 return true;
7222}
7223
7224static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7225 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7226 // range, then 0 is placed into the resulting vector. So pretty much any mask
7227 // of 8 elements can work here.
7228 return VT == MVT::v8i8 && M.size() == 8;
7229}
7230
7231static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7232 unsigned Index) {
7233 if (Mask.size() == Elements * 2)
7234 return Index / Elements;
7235 return Mask[Index] == 0 ? 0 : 1;
7236}
7237
7238// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7239// checking that pairs of elements in the shuffle mask represent the same index
7240// in each vector, incrementing the expected index by 2 at each step.
7241// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7242// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7243// v2={e,f,g,h}
7244// WhichResult gives the offset for each element in the mask based on which
7245// of the two results it belongs to.
7246//
7247// The transpose can be represented either as:
7248// result1 = shufflevector v1, v2, result1_shuffle_mask
7249// result2 = shufflevector v1, v2, result2_shuffle_mask
7250// where v1/v2 and the shuffle masks have the same number of elements
7251// (here WhichResult (see below) indicates which result is being checked)
7252//
7253// or as:
7254// results = shufflevector v1, v2, shuffle_mask
7255// where both results are returned in one vector and the shuffle mask has twice
7256// as many elements as v1/v2 (here WhichResult will always be 0 if true); here we
7257// want to check the low half and high half of the shuffle mask as if it were
7258// the other case
7259static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7260 unsigned EltSz = VT.getScalarSizeInBits();
7261 if (EltSz == 64)
7262 return false;
7263
7264 unsigned NumElts = VT.getVectorNumElements();
7265 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7266 return false;
7267
7268 // If the mask is twice as long as the input vector then we need to check the
7269 // upper and lower parts of the mask with a matching value for WhichResult
7270 // FIXME: A mask with only even values will be rejected in case the first
7271 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7272 // M[0] is used to determine WhichResult
7273 for (unsigned i = 0; i < M.size(); i += NumElts) {
7274 WhichResult = SelectPairHalf(NumElts, M, i);
7275 for (unsigned j = 0; j < NumElts; j += 2) {
7276 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7277 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7278 return false;
7279 }
7280 }
7281
7282 if (M.size() == NumElts*2)
7283 WhichResult = 0;
7284
7285 return true;
7286}
7287
7288/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7289/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7290/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7291static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7292 unsigned EltSz = VT.getScalarSizeInBits();
7293 if (EltSz == 64)
7294 return false;
7295
7296 unsigned NumElts = VT.getVectorNumElements();
7297 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7298 return false;
7299
7300 for (unsigned i = 0; i < M.size(); i += NumElts) {
7301 WhichResult = SelectPairHalf(NumElts, M, i);
7302 for (unsigned j = 0; j < NumElts; j += 2) {
7303 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7304 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7305 return false;
7306 }
7307 }
7308
7309 if (M.size() == NumElts*2)
7310 WhichResult = 0;
7311
7312 return true;
7313}
7314
7315// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7316// that the mask elements are either all even and in steps of size 2 or all odd
7317// and in steps of size 2.
7318// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7319// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7320// v2={e,f,g,h}
7321// Requires similar checks to that of isVTRNMask with
7322// respect to how the results are returned.
7323static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7324 unsigned EltSz = VT.getScalarSizeInBits();
7325 if (EltSz == 64)
7326 return false;
7327
7328 unsigned NumElts = VT.getVectorNumElements();
7329 if (M.size() != NumElts && M.size() != NumElts*2)
7330 return false;
7331
7332 for (unsigned i = 0; i < M.size(); i += NumElts) {
7333 WhichResult = SelectPairHalf(NumElts, M, i);
7334 for (unsigned j = 0; j < NumElts; ++j) {
7335 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7336 return false;
7337 }
7338 }
7339
7340 if (M.size() == NumElts*2)
7341 WhichResult = 0;
7342
7343 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7344 if (VT.is64BitVector() && EltSz == 32)
7345 return false;
7346
7347 return true;
7348}
7349
7350/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7351/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7352/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7353static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7354 unsigned EltSz = VT.getScalarSizeInBits();
7355 if (EltSz == 64)
7356 return false;
7357
7358 unsigned NumElts = VT.getVectorNumElements();
7359 if (M.size() != NumElts && M.size() != NumElts*2)
7360 return false;
7361
7362 unsigned Half = NumElts / 2;
7363 for (unsigned i = 0; i < M.size(); i += NumElts) {
7364 WhichResult = SelectPairHalf(NumElts, M, i);
7365 for (unsigned j = 0; j < NumElts; j += Half) {
7366 unsigned Idx = WhichResult;
7367 for (unsigned k = 0; k < Half; ++k) {
7368 int MIdx = M[i + j + k];
7369 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7370 return false;
7371 Idx += 2;
7372 }
7373 }
7374 }
7375
7376 if (M.size() == NumElts*2)
7377 WhichResult = 0;
7378
7379 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7380 if (VT.is64BitVector() && EltSz == 32)
7381 return false;
7382
7383 return true;
7384}
7385
7386// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7387// that pairs of elements of the shufflemask represent the same index in each
7388// vector incrementing sequentially through the vectors.
7389// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7390// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7391// v2={e,f,g,h}
7392// Requires similar checks to that of isVTRNMask with respect to how the results
7393// are returned.
7394static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7395 unsigned EltSz = VT.getScalarSizeInBits();
7396 if (EltSz == 64)
7397 return false;
7398
7399 unsigned NumElts = VT.getVectorNumElements();
7400 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7401 return false;
7402
7403 for (unsigned i = 0; i < M.size(); i += NumElts) {
7404 WhichResult = SelectPairHalf(NumElts, M, i);
7405 unsigned Idx = WhichResult * NumElts / 2;
7406 for (unsigned j = 0; j < NumElts; j += 2) {
7407 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7408 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7409 return false;
7410 Idx += 1;
7411 }
7412 }
7413
7414 if (M.size() == NumElts*2)
7415 WhichResult = 0;
7416
7417 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7418 if (VT.is64BitVector() && EltSz == 32)
7419 return false;
7420
7421 return true;
7422}
7423
7424/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7425/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7426/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7427static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7428 unsigned EltSz = VT.getScalarSizeInBits();
7429 if (EltSz == 64)
7430 return false;
7431
7432 unsigned NumElts = VT.getVectorNumElements();
7433 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7434 return false;
7435
7436 for (unsigned i = 0; i < M.size(); i += NumElts) {
7437 WhichResult = SelectPairHalf(NumElts, M, i);
7438 unsigned Idx = WhichResult * NumElts / 2;
7439 for (unsigned j = 0; j < NumElts; j += 2) {
7440 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7441 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7442 return false;
7443 Idx += 1;
7444 }
7445 }
7446
7447 if (M.size() == NumElts*2)
7448 WhichResult = 0;
7449
7450 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7451 if (VT.is64BitVector() && EltSz == 32)
7452 return false;
7453
7454 return true;
7455}
7456
7457/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7458/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7459static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7460 unsigned &WhichResult,
7461 bool &isV_UNDEF) {
7462 isV_UNDEF = false;
7463 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7464 return ARMISD::VTRN;
7465 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7466 return ARMISD::VUZP;
7467 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7468 return ARMISD::VZIP;
7469
7470 isV_UNDEF = true;
7471 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7472 return ARMISD::VTRN;
7473 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7474 return ARMISD::VUZP;
7475 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7476 return ARMISD::VZIP;
7477
7478 return 0;
7479}
7480
7481/// \return true if this is a reverse operation on a vector.
7482static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7483 unsigned NumElts = VT.getVectorNumElements();
7484 // Make sure the mask has the right size.
7485 if (NumElts != M.size())
7486 return false;
7487
7488 // Look for <15, ..., 3, -1, 1, 0>.
7489 for (unsigned i = 0; i != NumElts; ++i)
7490 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7491 return false;
7492
7493 return true;
7494}
7495
7496static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7497 unsigned NumElts = VT.getVectorNumElements();
7498 // Make sure the mask has the right size.
7499 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7500 return false;
7501
7502 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7503 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7504 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7505 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7506 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7507 int Ofs = Top ? 1 : 0;
7508 int Upper = SingleSource ? 0 : NumElts;
7509 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7510 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7511 return false;
7512 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7513 return false;
7514 }
7515 return true;
7516}
7517
7518static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7519 unsigned NumElts = VT.getVectorNumElements();
7520 // Make sure the mask has the right size.
7521 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7522 return false;
7523
7524 // If Top
7525 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7526 // This inserts Input2 into Input1
7527 // else if not Top
7528 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7529 // This inserts Input1 into Input2
7530 unsigned Offset = Top ? 0 : 1;
7531 unsigned N = SingleSource ? 0 : NumElts;
7532 for (unsigned i = 0; i < NumElts; i += 2) {
7533 if (M[i] >= 0 && M[i] != (int)i)
7534 return false;
7535 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7536 return false;
7537 }
7538
7539 return true;
7540}
7541
7542static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7543 unsigned NumElts = ToVT.getVectorNumElements();
7544 if (NumElts != M.size())
7545 return false;
7546
7547 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7548 // looking for patterns of:
7549 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7550 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7551
7552 unsigned Off0 = rev ? NumElts / 2 : 0;
7553 unsigned Off1 = rev ? 0 : NumElts / 2;
7554 for (unsigned i = 0; i < NumElts; i += 2) {
7555 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7556 return false;
7557 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7558 return false;
7559 }
7560
7561 return true;
7562}
7563
7564// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7565// from a pair of inputs. For example:
7566// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7567// FP_ROUND(EXTRACT_ELT(Y, 0),
7568// FP_ROUND(EXTRACT_ELT(X, 1),
7569// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7570static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7571 const ARMSubtarget *ST) {
7572 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7573 if (!ST->hasMVEFloatOps())
7574 return SDValue();
7575
7576 SDLoc dl(BV);
7577 EVT VT = BV.getValueType();
7578 if (VT != MVT::v8f16)
7579 return SDValue();
7580
7581 // We are looking for a buildvector of fptrunc elements, where all the
7582 // elements are interleavingly extracted from two sources. Check the first two
7583 // items are valid enough and extract some info from them (they are checked
7584 // properly in the loop below).
7585 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7586 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7587 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7588 return SDValue();
7589 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7590 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7591 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7592 return SDValue();
7593 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7594 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7595 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7596 return SDValue();
7597
7598 // Check all the values in the BuildVector line up with our expectations.
7599 for (unsigned i = 1; i < 4; i++) {
7600 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7601 return Trunc.getOpcode() == ISD::FP_ROUND &&
7602 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7603 Trunc.getOperand(0).getOperand(0) == Op &&
7604 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7605 };
7606 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7607 return SDValue();
7608 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7609 return SDValue();
7610 }
7611
7612 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7613 DAG.getConstant(0, dl, MVT::i32));
7614 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7615 DAG.getConstant(1, dl, MVT::i32));
7616}
7617
7618// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7619// from a single input on alternating lanes. For example:
7620// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7621// FP_ROUND(EXTRACT_ELT(X, 2),
7622// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7623static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7624 const ARMSubtarget *ST) {
7625 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7626 if (!ST->hasMVEFloatOps())
7627 return SDValue();
7628
7629 SDLoc dl(BV);
7630 EVT VT = BV.getValueType();
7631 if (VT != MVT::v4f32)
7632 return SDValue();
7633
7634 // We are looking for a buildvector of fpext elements, where all the
7635 // elements are alternating lanes from a single source. For example <0,2,4,6>
7636 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7637 // info from them (they are checked properly in the loop below).
7638 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7639 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7640 return SDValue();
7641 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7642 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7643 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7644 return SDValue();
7645
7646 // Check all the values in the BuildVector line up with our expectations.
7647 for (unsigned i = 1; i < 4; i++) {
7648 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7649 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7650 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7651 Trunc.getOperand(0).getOperand(0) == Op &&
7652 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7653 };
7654 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7655 return SDValue();
7656 }
7657
7658 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7659 DAG.getConstant(Offset, dl, MVT::i32));
7660}
7661
7662// If N is an integer constant that can be moved into a register in one
7663// instruction, return an SDValue of such a constant (will become a MOV
7664// instruction). Otherwise return null.
7665static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7666 const ARMSubtarget *ST, const SDLoc &dl) {
7667 uint64_t Val;
7668 if (!isa<ConstantSDNode>(N))
7669 return SDValue();
7670 Val = N->getAsZExtVal();
7671
7672 if (ST->isThumb1Only()) {
7673 if (Val <= 255 || ~Val <= 255)
7674 return DAG.getConstant(Val, dl, MVT::i32);
7675 } else {
7676 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7677 return DAG.getConstant(Val, dl, MVT::i32);
7678 }
7679 return SDValue();
7680}
7681
7682static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7683 const ARMSubtarget *ST) {
7684 SDLoc dl(Op);
7685 EVT VT = Op.getValueType();
7686
7687 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7688
7689 unsigned NumElts = VT.getVectorNumElements();
7690 unsigned BoolMask;
7691 unsigned BitsPerBool;
7692 if (NumElts == 2) {
7693 BitsPerBool = 8;
7694 BoolMask = 0xff;
7695 } else if (NumElts == 4) {
7696 BitsPerBool = 4;
7697 BoolMask = 0xf;
7698 } else if (NumElts == 8) {
7699 BitsPerBool = 2;
7700 BoolMask = 0x3;
7701 } else if (NumElts == 16) {
7702 BitsPerBool = 1;
7703 BoolMask = 0x1;
7704 } else
7705 return SDValue();
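 // Each v<N>i1 lane is replicated across 16/N bits of the 16-bit MVE
 // predicate register, e.g. for v4i1 lane i occupies bits [4*i+3 : 4*i].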
7706
7707 // If this is a single value copied into all lanes (a splat), we can just sign
7708 // extend that single value
7709 SDValue FirstOp = Op.getOperand(0);
7710 if (!isa<ConstantSDNode>(FirstOp) &&
7711 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7712 return U.get().isUndef() || U.get() == FirstOp;
7713 })) {
7714 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7715 DAG.getValueType(MVT::i1));
7716 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7717 }
7718
7719 // First create base with bits set where known
7720 unsigned Bits32 = 0;
7721 for (unsigned i = 0; i < NumElts; ++i) {
7722 SDValue V = Op.getOperand(i);
7723 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7724 continue;
7725 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7726 if (BitSet)
7727 Bits32 |= BoolMask << (i * BitsPerBool);
7728 }
7729
7730 // Add in unknown nodes
7731 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7732 DAG.getConstant(Bits32, dl, MVT::i32));
7733 for (unsigned i = 0; i < NumElts; ++i) {
7734 SDValue V = Op.getOperand(i);
7735 if (isa<ConstantSDNode>(V) || V.isUndef())
7736 continue;
7737 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7738 DAG.getConstant(i, dl, MVT::i32));
7739 }
7740
7741 return Base;
7742}
7743
7744static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7745 const ARMSubtarget *ST) {
7746 if (!ST->hasMVEIntegerOps())
7747 return SDValue();
7748
7749 // We are looking for a buildvector where each element is Op[0] + i*N
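 // e.g. BUILD_VECTOR(x, x+4, x+8, x+12) is matched as a VIDUP with a step of 4.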
7750 EVT VT = Op.getValueType();
7751 SDValue Op0 = Op.getOperand(0);
7752 unsigned NumElts = VT.getVectorNumElements();
7753
7754 // Get the increment value from operand 1
7755 SDValue Op1 = Op.getOperand(1);
7756 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7757 !isa<ConstantSDNode>(Op1.getOperand(1)))
7758 return SDValue();
7759 unsigned N = Op1.getConstantOperandVal(1);
7760 if (N != 1 && N != 2 && N != 4 && N != 8)
7761 return SDValue();
7762
7763 // Check that each other operand matches
7764 for (unsigned I = 2; I < NumElts; I++) {
7765 SDValue OpI = Op.getOperand(I);
7766 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7767 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7768 OpI.getConstantOperandVal(1) != I * N)
7769 return SDValue();
7770 }
7771
7772 SDLoc DL(Op);
7773 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7774 DAG.getConstant(N, DL, MVT::i32));
7775}
7776
7777// Returns true if the operation N can be treated as qr instruction variant at
7778// operand Op.
7779static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7780 switch (N->getOpcode()) {
7781 case ISD::ADD:
7782 case ISD::MUL:
7783 case ISD::SADDSAT:
7784 case ISD::UADDSAT:
7785 case ISD::AVGFLOORS:
7786 case ISD::AVGFLOORU:
7787 return true;
7788 case ISD::SUB:
7789 case ISD::SSUBSAT:
7790 case ISD::USUBSAT:
7791 return N->getOperand(1).getNode() == Op;
7792 case ISD::INTRINSIC_WO_CHAIN:
7793 switch (N->getConstantOperandVal(0)) {
7794 case Intrinsic::arm_mve_add_predicated:
7795 case Intrinsic::arm_mve_mul_predicated:
7796 case Intrinsic::arm_mve_qadd_predicated:
7797 case Intrinsic::arm_mve_vhadd:
7798 case Intrinsic::arm_mve_hadd_predicated:
7799 case Intrinsic::arm_mve_vqdmulh:
7800 case Intrinsic::arm_mve_qdmulh_predicated:
7801 case Intrinsic::arm_mve_vqrdmulh:
7802 case Intrinsic::arm_mve_qrdmulh_predicated:
7803 case Intrinsic::arm_mve_vqdmull:
7804 case Intrinsic::arm_mve_vqdmull_predicated:
7805 return true;
7806 case Intrinsic::arm_mve_sub_predicated:
7807 case Intrinsic::arm_mve_qsub_predicated:
7808 case Intrinsic::arm_mve_vhsub:
7809 case Intrinsic::arm_mve_hsub_predicated:
7810 return N->getOperand(2).getNode() == Op;
7811 default:
7812 return false;
7813 }
7814 default:
7815 return false;
7816 }
7817}
7818
7819// If this is a case we can't handle, return null and let the default
7820// expansion code take care of it.
7821SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7822 const ARMSubtarget *ST) const {
7823 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7824 SDLoc dl(Op);
7825 EVT VT = Op.getValueType();
7826
7827 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7828 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7829
7830 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7831 return R;
7832
7833 APInt SplatBits, SplatUndef;
7834 unsigned SplatBitSize;
7835 bool HasAnyUndefs;
7836 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7837 if (SplatUndef.isAllOnes())
7838 return DAG.getUNDEF(VT);
7839
7840 // If all the users of this constant splat are qr instruction variants,
7841 // generate a vdup of the constant.
7842 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7843 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7844 all_of(BVN->users(),
7845 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7846 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7847 : SplatBitSize == 16 ? MVT::v8i16
7848 : MVT::v16i8;
7849 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7850 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7851 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7852 }
7853
7854 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7855 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7856 // Check if an immediate VMOV works.
7857 EVT VmovVT;
7858 SDValue Val =
7859 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7860 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7861
7862 if (Val.getNode()) {
7863 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7864 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7865 }
7866
7867 // Try an immediate VMVN.
7868 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7869 Val = isVMOVModifiedImm(
7870 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7871 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7872 if (Val.getNode()) {
7873 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7874 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7875 }
7876
7877 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7878 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7879 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7880 if (ImmVal != -1) {
7881 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7882 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7883 }
7884 }
7885
7886 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7887 // type.
7888 if (ST->hasMVEIntegerOps() &&
7889 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7890 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7891 : SplatBitSize == 16 ? MVT::v8i16
7892 : MVT::v16i8;
7893 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7894 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7895 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7896 }
7897 }
7898 }
7899
7900 // Scan through the operands to see if only one value is used.
7901 //
7902 // As an optimisation, even if more than one value is used it may be more
7903 // profitable to splat with one value and then change some lanes.
7904 //
7905 // Heuristically we decide to do this if the vector has a "dominant" value,
7906 // defined as splatted to more than half of the lanes.
7907 unsigned NumElts = VT.getVectorNumElements();
7908 bool isOnlyLowElement = true;
7909 bool usesOnlyOneValue = true;
7910 bool hasDominantValue = false;
7911 bool isConstant = true;
7912
7913 // Map of the number of times a particular SDValue appears in the
7914 // element list.
7915 DenseMap<SDValue, unsigned> ValueCounts;
7916 SDValue Value;
7917 for (unsigned i = 0; i < NumElts; ++i) {
7918 SDValue V = Op.getOperand(i);
7919 if (V.isUndef())
7920 continue;
7921 if (i > 0)
7922 isOnlyLowElement = false;
7923 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7924 isConstant = false;
7925
7926 unsigned &Count = ValueCounts[V];
7927
7928 // Is this value dominant? (takes up more than half of the lanes)
7929 if (++Count > (NumElts / 2)) {
7930 hasDominantValue = true;
7931 Value = V;
7932 }
7933 }
7934 if (ValueCounts.size() != 1)
7935 usesOnlyOneValue = false;
7936 if (!Value.getNode() && !ValueCounts.empty())
7937 Value = ValueCounts.begin()->first;
7938
7939 if (ValueCounts.empty())
7940 return DAG.getUNDEF(VT);
7941
7942 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7943 // Keep going if we are hitting this case.
7944 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7945 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7946
7947 unsigned EltSize = VT.getScalarSizeInBits();
7948
7949 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7950 // i32 and try again.
7951 if (hasDominantValue && EltSize <= 32) {
7952 if (!isConstant) {
7953 SDValue N;
7954
7955 // If we are VDUPing a value that comes directly from a vector, that will
7956 // cause an unnecessary move to and from a GPR, where instead we could
7957 // just use VDUPLANE. We can only do this if the lane being extracted
7958 // is at a constant index, as the VDUP from lane instructions only have
7959 // constant-index forms.
7960 ConstantSDNode *constIndex;
7961 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7962 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7963 // We need to create a new undef vector to use for the VDUPLANE if the
7964 // size of the vector from which we get the value is different than the
7965 // size of the vector that we need to create. We will insert the element
7966 // such that the register coalescer will remove unnecessary copies.
7967 if (VT != Value->getOperand(0).getValueType()) {
7968 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7969 VT.getVectorNumElements();
7970 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7971 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7972 Value, DAG.getConstant(index, dl, MVT::i32)),
7973 DAG.getConstant(index, dl, MVT::i32));
7974 } else
7975 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7976 Value->getOperand(0), Value->getOperand(1));
7977 } else
7978 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7979
7980 if (!usesOnlyOneValue) {
7981 // The dominant value was splatted as 'N', but we now have to insert
7982 // all differing elements.
7983 for (unsigned I = 0; I < NumElts; ++I) {
7984 if (Op.getOperand(I) == Value)
7985 continue;
7986 SmallVector<SDValue, 3> Ops;
7987 Ops.push_back(N);
7988 Ops.push_back(Op.getOperand(I));
7989 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7990 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7991 }
7992 }
7993 return N;
7994 }
7995 if (VT.getVectorElementType().isFloatingPoint()) {
7996 SmallVector<SDValue, 8> Ops;
7997 MVT FVT = VT.getVectorElementType().getSimpleVT();
7998 assert(FVT == MVT::f32 || FVT == MVT::f16);
7999 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
8000 for (unsigned i = 0; i < NumElts; ++i)
8001 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8002 Op.getOperand(i)));
8003 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8004 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8005 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8006 if (Val.getNode())
8007 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8008 }
8009 if (usesOnlyOneValue) {
8010 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8011 if (isConstant && Val.getNode())
8012 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8013 }
8014 }
8015
8016 // If all elements are constants and the case above didn't get hit, fall back
8017 // to the default expansion, which will generate a load from the constant
8018 // pool.
8019 if (isConstant)
8020 return SDValue();
8021
8022 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8023 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8024 // length <= 2.
8025 if (NumElts >= 4)
8026 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8027 return shuffle;
8028
8029 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8030 // VCVT's
8031 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8032 return VCVT;
8033 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8034 return VCVT;
8035
8036 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8037 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8038 // into two 64-bit vectors; we might discover a better way to lower it.
8039 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8040 EVT ExtVT = VT.getVectorElementType();
8041 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8042 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8043 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8044 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8045 SDValue Upper =
8046 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8047 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8048 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8049 if (Lower && Upper)
8050 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8051 }
8052
8053 // Vectors with 32- or 64-bit elements can be built by directly assigning
8054 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8055 // will be legalized.
8056 if (EltSize >= 32) {
8057 // Do the expansion with floating-point types, since that is what the VFP
8058 // registers are defined to use, and since i64 is not legal.
8059 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8060 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8061 SmallVector<SDValue, 8> Ops;
8062 for (unsigned i = 0; i < NumElts; ++i)
8063 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8064 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8065 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8066 }
8067
8068 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8069 // know the default expansion would otherwise fall back on something even
8070 // worse. For a vector with one or two non-undef values, that's
8071 // scalar_to_vector for the elements followed by a shuffle (provided the
8072 // shuffle is valid for the target) and materialization element by element
8073 // on the stack followed by a load for everything else.
8074 if (!isConstant && !usesOnlyOneValue) {
8075 SDValue Vec = DAG.getUNDEF(VT);
8076 for (unsigned i = 0 ; i < NumElts; ++i) {
8077 SDValue V = Op.getOperand(i);
8078 if (V.isUndef())
8079 continue;
8080 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8081 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8082 }
8083 return Vec;
8084 }
8085
8086 return SDValue();
8087}
8088
8089// Gather data to see if the operation can be modelled as a
8090// shuffle in combination with VEXTs.
8091SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8092 SelectionDAG &DAG) const {
8093 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8094 SDLoc dl(Op);
8095 EVT VT = Op.getValueType();
8096 unsigned NumElts = VT.getVectorNumElements();
8097
8098 struct ShuffleSourceInfo {
8099 SDValue Vec;
8100 unsigned MinElt = std::numeric_limits<unsigned>::max();
8101 unsigned MaxElt = 0;
8102
8103 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8104 // be compatible with the shuffle we intend to construct. As a result
8105 // ShuffleVec will be some sliding window into the original Vec.
8106 SDValue ShuffleVec;
8107
8108 // Code should guarantee that element i in Vec starts at element "WindowBase
8109 // + i * WindowScale in ShuffleVec".
8110 int WindowBase = 0;
8111 int WindowScale = 1;
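    // For instance, once a v4i32 source is reg-cast to a v8i16 shuffle type,
    // WindowScale becomes 2 and source element i covers shuffle lanes
    // WindowBase + 2*i and WindowBase + 2*i + 1.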
8112
8113 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8114
8115 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8116 };
8117
8118 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8119 // node.
8120 SmallVector<ShuffleSourceInfo, 2> Sources;
8121 for (unsigned i = 0; i < NumElts; ++i) {
8122 SDValue V = Op.getOperand(i);
8123 if (V.isUndef())
8124 continue;
8125 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8126 // A shuffle can only come from building a vector from various
8127 // elements of other vectors.
8128 return SDValue();
8129 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8130 // Furthermore, shuffles require a constant mask, whereas extractelts
8131 // accept variable indices.
8132 return SDValue();
8133 }
8134
8135 // Add this element source to the list if it's not already there.
8136 SDValue SourceVec = V.getOperand(0);
8137 auto Source = llvm::find(Sources, SourceVec);
8138 if (Source == Sources.end())
8139 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8140
8141 // Update the minimum and maximum lane number seen.
8142 unsigned EltNo = V.getConstantOperandVal(1);
8143 Source->MinElt = std::min(Source->MinElt, EltNo);
8144 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8145 }
8146
8147 // Currently only do something sane when at most two source vectors
8148 // are involved.
8149 if (Sources.size() > 2)
8150 return SDValue();
8151
8152 // Find out the smallest element size among result and two sources, and use
8153 // it as element size to build the shuffle_vector.
8154 EVT SmallestEltTy = VT.getVectorElementType();
8155 for (auto &Source : Sources) {
8156 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8157 if (SrcEltTy.bitsLT(SmallestEltTy))
8158 SmallestEltTy = SrcEltTy;
8159 }
8160 unsigned ResMultiplier =
8161 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8162 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8163 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8164
8165 // If the source vector is too wide or too narrow, we may nevertheless be able
8166 // to construct a compatible shuffle either by concatenating it with UNDEF or
8167 // extracting a suitable range of elements.
8168 for (auto &Src : Sources) {
8169 EVT SrcVT = Src.ShuffleVec.getValueType();
8170
8171 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8172 uint64_t VTSize = VT.getFixedSizeInBits();
8173 if (SrcVTSize == VTSize)
8174 continue;
8175
8176 // This stage of the search produces a source with the same element type as
8177 // the original, but with a total width matching the BUILD_VECTOR output.
8178 EVT EltVT = SrcVT.getVectorElementType();
8179 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8180 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8181
8182 if (SrcVTSize < VTSize) {
8183 if (2 * SrcVTSize != VTSize)
8184 return SDValue();
8185 // We can pad out the smaller vector for free, so if it's part of a
8186 // shuffle...
8187 Src.ShuffleVec =
8188 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8189 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8190 continue;
8191 }
8192
8193 if (SrcVTSize != 2 * VTSize)
8194 return SDValue();
8195
8196 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8197 // Span too large for a VEXT to cope
8198 return SDValue();
8199 }
8200
8201 if (Src.MinElt >= NumSrcElts) {
8202 // The extraction can just take the second half
8203 Src.ShuffleVec =
8204 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8205 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8206 Src.WindowBase = -NumSrcElts;
8207 } else if (Src.MaxElt < NumSrcElts) {
8208 // The extraction can just take the first half
8209 Src.ShuffleVec =
8210 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8211 DAG.getConstant(0, dl, MVT::i32));
8212 } else {
8213 // An actual VEXT is needed
8214 SDValue VEXTSrc1 =
8215 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8216 DAG.getConstant(0, dl, MVT::i32));
8217 SDValue VEXTSrc2 =
8218 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8219 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8220
8221 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8222 VEXTSrc2,
8223 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8224 Src.WindowBase = -Src.MinElt;
8225 }
8226 }
8227
8228 // Another possible incompatibility occurs from the vector element types. We
8229 // can fix this by bitcasting the source vectors to the same type we intend
8230 // for the shuffle.
8231 for (auto &Src : Sources) {
8232 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8233 if (SrcEltTy == SmallestEltTy)
8234 continue;
8235 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8236 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8237 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8238 Src.WindowBase *= Src.WindowScale;
8239 }
8240
8241 // Final check before we try to actually produce a shuffle.
8242 LLVM_DEBUG({
8243 for (auto Src : Sources)
8244 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8245 });
8246
8247 // The stars all align, our next step is to produce the mask for the shuffle.
8248 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8249 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8250 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8251 SDValue Entry = Op.getOperand(i);
8252 if (Entry.isUndef())
8253 continue;
8254
8255 auto Src = llvm::find(Sources, Entry.getOperand(0));
8256 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8257
8258 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8259 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8260 // segment.
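    // For example, an i8 element extracted into an i32 BUILD_VECTOR lane only
    // defines 8 bits, i.e. a single lane of an i8-based shuffle type.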
8261 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8262 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8263 VT.getScalarSizeInBits());
8264 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8265
8266 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8267 // starting at the appropriate offset.
8268 int *LaneMask = &Mask[i * ResMultiplier];
8269
8270 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8271 ExtractBase += NumElts * (Src - Sources.begin());
8272 for (int j = 0; j < LanesDefined; ++j)
8273 LaneMask[j] = ExtractBase + j;
8274 }
8275
8276
8277 // We can't handle more than two sources. This should have already
8278 // been checked before this point.
8279 assert(Sources.size() <= 2 && "Too many sources!");
8280
8281 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8282 for (unsigned i = 0; i < Sources.size(); ++i)
8283 ShuffleOps[i] = Sources[i].ShuffleVec;
8284
8285 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8286 ShuffleOps[1], Mask, DAG);
8287 if (!Shuffle)
8288 return SDValue();
8289 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8290}
8291
8292enum ShuffleOpCodes {
8293 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8294 OP_VREV,
8295 OP_VDUP0,
8296 OP_VDUP1,
8297 OP_VDUP2,
8298 OP_VDUP3,
8299 OP_VEXT1,
8300 OP_VEXT2,
8301 OP_VEXT3,
8302 OP_VUZPL, // VUZP, left result
8303 OP_VUZPR, // VUZP, right result
8304 OP_VZIPL, // VZIP, left result
8305 OP_VZIPR, // VZIP, right result
8306 OP_VTRNL, // VTRN, left result
8307 OP_VTRNR // VTRN, right result
8308};
8309
8310static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8311 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8312 switch (OpNum) {
8313 case OP_COPY:
8314 case OP_VREV:
8315 case OP_VDUP0:
8316 case OP_VDUP1:
8317 case OP_VDUP2:
8318 case OP_VDUP3:
8319 return true;
8320 }
8321 return false;
8322}
8323
8324/// isShuffleMaskLegal - Targets can use this to indicate that they only
8325/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8326/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8327/// are assumed to be legal.
8328bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8329 if (VT.getVectorNumElements() == 4 &&
8330 (VT.is128BitVector() || VT.is64BitVector())) {
8331 unsigned PFIndexes[4];
8332 for (unsigned i = 0; i != 4; ++i) {
8333 if (M[i] < 0)
8334 PFIndexes[i] = 8;
8335 else
8336 PFIndexes[i] = M[i];
8337 }
8338
8339 // Compute the index in the perfect shuffle table.
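      // Each PFIndex is a base-9 digit (0-7 = lane, 8 = undef), so the four
      // mask entries form a 4-digit base-9 number into PerfectShuffleTable.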
8340 unsigned PFTableIndex =
8341 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8342 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8343 unsigned Cost = (PFEntry >> 30);
8344
8345 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8346 return true;
8347 }
8348
8349 bool ReverseVEXT, isV_UNDEF;
8350 unsigned Imm, WhichResult;
8351
8352 unsigned EltSize = VT.getScalarSizeInBits();
8353 if (EltSize >= 32 ||
8354 ShuffleVectorSDNode::isSplatMask(M) ||
8355 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8356 isVREVMask(M, VT, 64) ||
8357 isVREVMask(M, VT, 32) ||
8358 isVREVMask(M, VT, 16))
8359 return true;
8360 else if (Subtarget->hasNEON() &&
8361 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8362 isVTBLMask(M, VT) ||
8363 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8364 return true;
8365 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8366 isReverseMask(M, VT))
8367 return true;
8368 else if (Subtarget->hasMVEIntegerOps() &&
8369 (isVMOVNMask(M, VT, true, false) ||
8370 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8371 return true;
8372 else if (Subtarget->hasMVEIntegerOps() &&
8373 (isTruncMask(M, VT, false, false) ||
8374 isTruncMask(M, VT, false, true) ||
8375 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8376 return true;
8377 else
8378 return false;
8379}
8380
8381/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8382/// the specified operations to build the shuffle.
8383static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8384 SDValue RHS, SelectionDAG &DAG,
8385 const SDLoc &dl) {
8386 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8387 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8388 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
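  // PFEntry layout: bits 31-30 hold the cost, bits 29-26 the opcode, bits
  // 25-13 the LHS table entry and bits 12-0 the RHS table entry.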
8389
8390 if (OpNum == OP_COPY) {
8391 if (LHSID == (1*9+2)*9+3) return LHS;
8392 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8393 return RHS;
8394 }
8395
8396 SDValue OpLHS, OpRHS;
8397 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8398 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8399 EVT VT = OpLHS.getValueType();
8400
8401 switch (OpNum) {
8402 default: llvm_unreachable("Unknown shuffle opcode!");
8403 case OP_VREV:
8404 // VREV divides the vector in half and swaps within the half.
8405 if (VT.getScalarSizeInBits() == 32)
8406 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8407 // vrev <4 x i16> -> VREV32
8408 if (VT.getScalarSizeInBits() == 16)
8409 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8410 // vrev <4 x i8> -> VREV16
8411 assert(VT.getScalarSizeInBits() == 8);
8412 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8413 case OP_VDUP0:
8414 case OP_VDUP1:
8415 case OP_VDUP2:
8416 case OP_VDUP3:
8417 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8418 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8419 case OP_VEXT1:
8420 case OP_VEXT2:
8421 case OP_VEXT3:
8422 return DAG.getNode(ARMISD::VEXT, dl, VT,
8423 OpLHS, OpRHS,
8424 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8425 case OP_VUZPL:
8426 case OP_VUZPR:
8427 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8428 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8429 case OP_VZIPL:
8430 case OP_VZIPR:
8431 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8432 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8433 case OP_VTRNL:
8434 case OP_VTRNR:
8435 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8436 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8437 }
8438}
8439
8440static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8441 ArrayRef<int> ShuffleMask,
8442 SelectionDAG &DAG) {
8443 // Check to see if we can use the VTBL instruction.
8444 SDValue V1 = Op.getOperand(0);
8445 SDValue V2 = Op.getOperand(1);
8446 SDLoc DL(Op);
8447
8448 SmallVector<SDValue, 8> VTBLMask;
8449 for (int I : ShuffleMask)
8450 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8451
8452 if (V2.getNode()->isUndef())
8453 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8454 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8455
8456 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8457 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8458}
8459
8460static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8461 SDLoc DL(Op);
8462 EVT VT = Op.getValueType();
8463
8464 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8465 "Expect an v8i16/v16i8 type");
8466 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8467 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8468 // extract the first 8 bytes into the top double word and the last 8 bytes
8469 // into the bottom double word, through a new vector shuffle that will be
8470 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
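  // For a v16i8 this builds the mask <8..15, 0..7>, swapping the two double
  // words of the VREV64 result.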
8471 std::vector<int> NewMask;
8472 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8473 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8474 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8475 NewMask.push_back(i);
8476 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8477}
8478
8479static EVT getVectorTyFromPredicateVector(EVT VT) {
8480 switch (VT.getSimpleVT().SimpleTy) {
8481 case MVT::v2i1:
8482 return MVT::v2f64;
8483 case MVT::v4i1:
8484 return MVT::v4i32;
8485 case MVT::v8i1:
8486 return MVT::v8i16;
8487 case MVT::v16i1:
8488 return MVT::v16i8;
8489 default:
8490 llvm_unreachable("Unexpected vector predicate type");
8491 }
8492}
8493
8494static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8495 SelectionDAG &DAG) {
8496 // Converting from boolean predicates to integers involves creating a vector
8497 // of all ones or all zeroes and selecting the lanes based upon the real
8498 // predicate.
8499 SDValue AllOnes =
8500 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8501 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8502
8503 SDValue AllZeroes =
8504 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8505 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8506
8507 // Get full vector type from predicate type
8508 EVT NewVT = getVectorTyFromPredicateVector(VT);
8509
8510 SDValue RecastV1;
8511 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8512 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8513 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8514 // since we know in hardware the sizes are really the same.
8515 if (VT != MVT::v16i1)
8516 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8517 else
8518 RecastV1 = Pred;
8519
8520 // Select either all ones or zeroes depending upon the real predicate bits.
8521 SDValue PredAsVector =
8522 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8523
8524 // Recast our new predicate-as-integer v16i8 vector into something
8525 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8526 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8527}
8528
8529static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8530 const ARMSubtarget *ST) {
8531 EVT VT = Op.getValueType();
8532 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8533 ArrayRef<int> ShuffleMask = SVN->getMask();
8534
8535 assert(ST->hasMVEIntegerOps() &&
8536 "No support for vector shuffle of boolean predicates");
8537
8538 SDValue V1 = Op.getOperand(0);
8539 SDValue V2 = Op.getOperand(1);
8540 SDLoc dl(Op);
8541 if (isReverseMask(ShuffleMask, VT)) {
8542 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8543 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8544 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8545 DAG.getConstant(16, dl, MVT::i32));
8546 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8547 }
8548
8549 // Until we can come up with optimised cases for every single vector
8550 // shuffle in existence we have chosen the least painful strategy. This is
8551 // to essentially promote the boolean predicate to an 8-bit integer, where
8552 // each predicate represents a byte. Then we fall back on a normal integer
8553 // vector shuffle and convert the result back into a predicate vector. In
8554 // many cases the generated code might be even better than scalar code
8555 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8556 // fields in a register into 8 other arbitrary 2-bit fields!
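  // For example, a v8i1 predicate becomes a v8i16 of all-ones/all-zeros lanes,
  // is shuffled as a normal integer vector, and a compare against zero then
  // recreates the v8i1 result.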
8557 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8558 EVT NewVT = PredAsVector1.getValueType();
8559 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8560 : PromoteMVEPredVector(dl, V2, VT, DAG);
8561 assert(PredAsVector2.getValueType() == NewVT &&
8562 "Expected identical vector type in expanded i1 shuffle!");
8563
8564 // Do the shuffle!
8565 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8566 PredAsVector2, ShuffleMask);
8567
8568 // Now return the result of comparing the shuffled vector with zero,
8569 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8570 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8571 if (VT == MVT::v2i1) {
8572 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8573 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8574 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8575 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8576 }
8577 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8578 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8579}
8580
8581static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8582 ArrayRef<int> ShuffleMask,
8583 SelectionDAG &DAG) {
8584 // Attempt to lower the vector shuffle using as many whole register movs as
8585 // possible. This is useful for types smaller than 32 bits, which would
8586 // often otherwise become a series of GPR movs.
8587 SDLoc dl(Op);
8588 EVT VT = Op.getValueType();
8589 if (VT.getScalarSizeInBits() >= 32)
8590 return SDValue();
8591
8592 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8593 "Unexpected vector type");
8594 int NumElts = VT.getVectorNumElements();
8595 int QuarterSize = NumElts / 4;
8596 // The four final parts of the vector, as i32's
8597 SDValue Parts[4];
8598
8599 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8600 // <u,u,u,u>), returning the vmov lane index
8601 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8602 // Detect which mov lane this would be from the first non-undef element.
8603 int MovIdx = -1;
8604 for (int i = 0; i < Length; i++) {
8605 if (ShuffleMask[Start + i] >= 0) {
8606 if (ShuffleMask[Start + i] % Length != i)
8607 return -1;
8608 MovIdx = ShuffleMask[Start + i] / Length;
8609 break;
8610 }
8611 }
8612 // If all items are undef, leave this for other combines
8613 if (MovIdx == -1)
8614 return -1;
8615 // Check the remaining values are the correct part of the same mov
8616 for (int i = 1; i < Length; i++) {
8617 if (ShuffleMask[Start + i] >= 0 &&
8618 (ShuffleMask[Start + i] / Length != MovIdx ||
8619 ShuffleMask[Start + i] % Length != i))
8620 return -1;
8621 }
8622 return MovIdx;
8623 };
8624
8625 for (int Part = 0; Part < 4; ++Part) {
8626 // Does this part look like a mov
8627 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8628 if (Elt != -1) {
8629 SDValue Input = Op->getOperand(0);
8630 if (Elt >= 4) {
8631 Input = Op->getOperand(1);
8632 Elt -= 4;
8633 }
8634 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8635 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8636 DAG.getConstant(Elt, dl, MVT::i32));
8637 }
8638 }
8639
8640 // Nothing interesting found, just return
8641 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8642 return SDValue();
8643
8644 // The other parts need to be built with the old shuffle vector, cast to a
8645 // v4i32 and extract_vector_elts
8646 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8647 SmallVector<int, 16> NewShuffleMask;
8648 for (int Part = 0; Part < 4; ++Part)
8649 for (int i = 0; i < QuarterSize; i++)
8650 NewShuffleMask.push_back(
8651 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8652 SDValue NewShuffle = DAG.getVectorShuffle(
8653 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8654 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8655
8656 for (int Part = 0; Part < 4; ++Part)
8657 if (!Parts[Part])
8658 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8659 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8660 }
8661 // Build a vector out of the various parts and bitcast it back to the original
8662 // type.
8663 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8664 return DAG.getBitcast(VT, NewVec);
8665}
8666
8667static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8668 ArrayRef<int> ShuffleMask,
8669 SelectionDAG &DAG) {
8670 SDValue V1 = Op.getOperand(0);
8671 SDValue V2 = Op.getOperand(1);
8672 EVT VT = Op.getValueType();
8673 unsigned NumElts = VT.getVectorNumElements();
8674
8675 // A One-Off Identity mask is one that is mostly an identity mask from a
8676 // single source but contains a single element out-of-place, either from a
8677 // different vector or from another position in the same vector. As opposed to
8678 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8679 // pair directly.
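  // For example, for a v4i32 shuffle the mask <0,1,2,7> is an identity of V1
  // except for lane 3, which can be filled by extracting element 3 of V2 and
  // inserting it into V1.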
8680 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8681 int &OffElement) {
8682 OffElement = -1;
8683 int NonUndef = 0;
8684 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8685 if (Mask[i] == -1)
8686 continue;
8687 NonUndef++;
8688 if (Mask[i] != i + BaseOffset) {
8689 if (OffElement == -1)
8690 OffElement = i;
8691 else
8692 return false;
8693 }
8694 }
8695 return NonUndef > 2 && OffElement != -1;
8696 };
8697 int OffElement;
8698 SDValue VInput;
8699 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8700 VInput = V1;
8701 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8702 VInput = V2;
8703 else
8704 return SDValue();
8705
8706 SDLoc dl(Op);
8707 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8708 ? MVT::i32
8709 : VT.getScalarType();
8710 SDValue Elt = DAG.getNode(
8711 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8712 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8713 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8714 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8715 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8716}
8717
8718static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8719 const ARMSubtarget *ST) {
8720 SDValue V1 = Op.getOperand(0);
8721 SDValue V2 = Op.getOperand(1);
8722 SDLoc dl(Op);
8723 EVT VT = Op.getValueType();
8724 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8725 unsigned EltSize = VT.getScalarSizeInBits();
8726
8727 if (ST->hasMVEIntegerOps() && EltSize == 1)
8728 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8729
8730 // Convert shuffles that are directly supported on NEON to target-specific
8731 // DAG nodes, instead of keeping them as shuffles and matching them again
8732 // during code selection. This is more efficient and avoids the possibility
8733 // of inconsistencies between legalization and selection.
8734 // FIXME: floating-point vectors should be canonicalized to integer vectors
8735 // of the same type so that they get CSEd properly.
8736 ArrayRef<int> ShuffleMask = SVN->getMask();
8737
8738 if (EltSize <= 32) {
8739 if (SVN->isSplat()) {
8740 int Lane = SVN->getSplatIndex();
8741 // If this is undef splat, generate it via "just" vdup, if possible.
8742 if (Lane == -1) Lane = 0;
8743
8744 // Test if V1 is a SCALAR_TO_VECTOR.
8745 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8746 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8747 }
8748 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8749 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8750 // reaches it).
8751 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8752 !isa<ConstantSDNode>(V1.getOperand(0))) {
8753 bool IsScalarToVector = true;
8754 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8755 if (!V1.getOperand(i).isUndef()) {
8756 IsScalarToVector = false;
8757 break;
8758 }
8759 if (IsScalarToVector)
8760 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8761 }
8762 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8763 DAG.getConstant(Lane, dl, MVT::i32));
8764 }
8765
8766 bool ReverseVEXT = false;
8767 unsigned Imm = 0;
8768 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8769 if (ReverseVEXT)
8770 std::swap(V1, V2);
8771 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8772 DAG.getConstant(Imm, dl, MVT::i32));
8773 }
8774
8775 if (isVREVMask(ShuffleMask, VT, 64))
8776 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8777 if (isVREVMask(ShuffleMask, VT, 32))
8778 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8779 if (isVREVMask(ShuffleMask, VT, 16))
8780 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8781
8782 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8783 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8784 DAG.getConstant(Imm, dl, MVT::i32));
8785 }
8786
8787 // Check for Neon shuffles that modify both input vectors in place.
8788 // If both results are used, i.e., if there are two shuffles with the same
8789 // source operands and with masks corresponding to both results of one of
8790 // these operations, DAG memoization will ensure that a single node is
8791 // used for both shuffles.
8792 unsigned WhichResult = 0;
8793 bool isV_UNDEF = false;
8794 if (ST->hasNEON()) {
8795 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8796 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8797 if (isV_UNDEF)
8798 V2 = V1;
8799 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8800 .getValue(WhichResult);
8801 }
8802 }
8803 if (ST->hasMVEIntegerOps()) {
8804 if (isVMOVNMask(ShuffleMask, VT, false, false))
8805 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8806 DAG.getConstant(0, dl, MVT::i32));
8807 if (isVMOVNMask(ShuffleMask, VT, true, false))
8808 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8809 DAG.getConstant(1, dl, MVT::i32));
8810 if (isVMOVNMask(ShuffleMask, VT, true, true))
8811 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8812 DAG.getConstant(1, dl, MVT::i32));
8813 }
8814
8815 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8816 // shuffles that produce a result larger than their operands with:
8817 // shuffle(concat(v1, undef), concat(v2, undef))
8818 // ->
8819 // shuffle(concat(v1, v2), undef)
8820 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8821 //
8822 // This is useful in the general case, but there are special cases where
8823 // native shuffles produce larger results: the two-result ops.
8824 //
8825 // Look through the concat when lowering them:
8826 // shuffle(concat(v1, v2), undef)
8827 // ->
8828 // concat(VZIP(v1, v2):0, :1)
8829 //
8830 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8831 SDValue SubV1 = V1->getOperand(0);
8832 SDValue SubV2 = V1->getOperand(1);
8833 EVT SubVT = SubV1.getValueType();
8834
8835 // We expect these to have been canonicalized to -1.
8836 assert(llvm::all_of(ShuffleMask, [&](int i) {
8837 return i < (int)VT.getVectorNumElements();
8838 }) && "Unexpected shuffle index into UNDEF operand!");
8839
8840 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8841 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8842 if (isV_UNDEF)
8843 SubV2 = SubV1;
8844 assert((WhichResult == 0) &&
8845 "In-place shuffle of concat can only have one result!");
8846 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8847 SubV1, SubV2);
8848 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8849 Res.getValue(1));
8850 }
8851 }
8852 }
8853
8854 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8855 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8856 return V;
8857
8858 for (bool Top : {false, true}) {
8859 for (bool SingleSource : {false, true}) {
8860 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8861 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8862 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8863 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8864 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8865 SingleSource ? V1 : V2);
8866 if (Top) {
8867 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8868 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8869 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8870 }
8871 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8872 }
8873 }
8874 }
8875 }
8876
8877 // If the shuffle is not directly supported and it has 4 elements, use
8878 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8879 unsigned NumElts = VT.getVectorNumElements();
8880 if (NumElts == 4) {
8881 unsigned PFIndexes[4];
8882 for (unsigned i = 0; i != 4; ++i) {
8883 if (ShuffleMask[i] < 0)
8884 PFIndexes[i] = 8;
8885 else
8886 PFIndexes[i] = ShuffleMask[i];
8887 }
8888
8889 // Compute the index in the perfect shuffle table.
8890 unsigned PFTableIndex =
8891 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8892 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8893 unsigned Cost = (PFEntry >> 30);
8894
8895 if (Cost <= 4) {
8896 if (ST->hasNEON())
8897 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8898 else if (isLegalMVEShuffleOp(PFEntry)) {
8899 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8900 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8901 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8902 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8903 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8904 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8905 }
8906 }
8907 }
8908
8909 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8910 if (EltSize >= 32) {
8911 // Do the expansion with floating-point types, since that is what the VFP
8912 // registers are defined to use, and since i64 is not legal.
8913 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8914 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8915 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8916 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8917 SmallVector<SDValue, 8> Ops;
8918 for (unsigned i = 0; i < NumElts; ++i) {
8919 if (ShuffleMask[i] < 0)
8920 Ops.push_back(DAG.getUNDEF(EltVT));
8921 else
8922 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8923 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8924 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8925 dl, MVT::i32)));
8926 }
8927 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8928 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8929 }
8930
8931 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8932 isReverseMask(ShuffleMask, VT))
8933 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8934
8935 if (ST->hasNEON() && VT == MVT::v8i8)
8936 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8937 return NewOp;
8938
8939 if (ST->hasMVEIntegerOps())
8940 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8941 return NewOp;
8942
8943 return SDValue();
8944}
8945
8946static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8947 const ARMSubtarget *ST) {
8948 EVT VecVT = Op.getOperand(0).getValueType();
8949 SDLoc dl(Op);
8950
8951 assert(ST->hasMVEIntegerOps() &&
8952 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8953
8954 SDValue Conv =
8955 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8956 unsigned Lane = Op.getConstantOperandVal(2);
8957 unsigned LaneWidth =
8958 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8959 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
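  // MVE predicates live in the 16-bit VPR.P0 register, so each v4i1 lane
  // occupies 4 bits, each v8i1 lane 2 bits and each v16i1 lane 1 bit; the mask
  // above selects the bits belonging to the lane being inserted.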
8960 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8961 Op.getOperand(1), DAG.getValueType(MVT::i1));
8962 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8963 DAG.getConstant(~Mask, dl, MVT::i32));
8964 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8965}
8966
8967SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8968 SelectionDAG &DAG) const {
8969 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8970 SDValue Lane = Op.getOperand(2);
8971 if (!isa<ConstantSDNode>(Lane))
8972 return SDValue();
8973
8974 SDValue Elt = Op.getOperand(1);
8975 EVT EltVT = Elt.getValueType();
8976
8977 if (Subtarget->hasMVEIntegerOps() &&
8978 Op.getValueType().getScalarSizeInBits() == 1)
8979 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8980
8981 if (getTypeAction(*DAG.getContext(), EltVT) ==
8982 TargetLowering::TypePromoteInteger) {
8983 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8984 // but the type system will try to do that if we don't intervene.
8985 // Reinterpret any such vector-element insertion as one with the
8986 // corresponding integer types.
8987
8988 SDLoc dl(Op);
8989
8990 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8991 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8992 TargetLowering::TypePromoteInteger);
8993
8994 SDValue VecIn = Op.getOperand(0);
8995 EVT VecVT = VecIn.getValueType();
8996 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8997 VecVT.getVectorNumElements());
8998
8999 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
9000 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9001 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9002 IVecIn, IElt, Lane);
9003 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9004 }
9005
9006 return Op;
9007}
9008
9009static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9010 const ARMSubtarget *ST) {
9011 EVT VecVT = Op.getOperand(0).getValueType();
9012 SDLoc dl(Op);
9013
9014 assert(ST->hasMVEIntegerOps() &&
9015 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9016
9017 SDValue Conv =
9018 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9019 unsigned Lane = Op.getConstantOperandVal(1);
9020 unsigned LaneWidth =
9021 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9022 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9023 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9024 return Shift;
9025}
9026
9027static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9028 const ARMSubtarget *ST) {
9029 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9030 SDValue Lane = Op.getOperand(1);
9031 if (!isa<ConstantSDNode>(Lane))
9032 return SDValue();
9033
9034 SDValue Vec = Op.getOperand(0);
9035 EVT VT = Vec.getValueType();
9036
9037 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9038 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9039
9040 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9041 SDLoc dl(Op);
9042 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9043 }
9044
9045 return Op;
9046}
9047
9048static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9049 const ARMSubtarget *ST) {
9050 SDLoc dl(Op);
9051 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9052 "Unexpected custom CONCAT_VECTORS lowering");
9053 assert(isPowerOf2_32(Op.getNumOperands()) &&
9054 "Unexpected custom CONCAT_VECTORS lowering");
9055 assert(ST->hasMVEIntegerOps() &&
9056 "CONCAT_VECTORS lowering only supported for MVE");
9057
9058 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9059 EVT Op1VT = V1.getValueType();
9060 EVT Op2VT = V2.getValueType();
9061 assert(Op1VT == Op2VT && "Operand types don't match!");
9062 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9063 "Unexpected i1 concat operations!");
9064 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9065
9066 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9067 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9068
9069 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9070 // promoted to v8i16, etc.
9071 MVT ElType =
9072 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9073 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9074
9075 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9076 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9077 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9078 // ConcatVT.
9079 SDValue ConVec =
9080 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9081 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9082 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9083 }
9084
9085 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9086 // to be the right size for the destination. For example, if Op1 is v4i1
9087 // then the promoted vector is v4i32. The result of concatenation gives a
9088 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9089 // needs truncating to i16 and inserting in the result.
9090 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9091 EVT NewVT = NewV.getValueType();
9092 EVT ConcatVT = ConVec.getValueType();
9093 unsigned ExtScale = 1;
9094 if (NewVT == MVT::v2f64) {
9095 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9096 ExtScale = 2;
9097 }
9098 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9099 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9100 DAG.getIntPtrConstant(i * ExtScale, dl));
9101 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9102 DAG.getConstant(j, dl, MVT::i32));
9103 }
9104 return ConVec;
9105 };
9106 unsigned j = 0;
9107 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9108 ConVec = ExtractInto(NewV1, ConVec, j);
9109 ConVec = ExtractInto(NewV2, ConVec, j);
9110
9111 // Now return the result of comparing the subvector with zero, which will
9112 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9113 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9114 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9115 };
9116
9117 // Concat each pair of subvectors and pack into the lower half of the array.
9118 SmallVector<SDValue> ConcatOps(Op->ops());
9119 while (ConcatOps.size() > 1) {
9120 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9121 SDValue V1 = ConcatOps[I];
9122 SDValue V2 = ConcatOps[I + 1];
9123 ConcatOps[I / 2] = ConcatPair(V1, V2);
9124 }
9125 ConcatOps.resize(ConcatOps.size() / 2);
9126 }
9127 return ConcatOps[0];
9128}
9129
9130static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9131 const ARMSubtarget *ST) {
9132 EVT VT = Op->getValueType(0);
9133 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9134 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9135
9136 // The only time a CONCAT_VECTORS operation can have legal types is when
9137 // two 64-bit vectors are concatenated to a 128-bit vector.
9138 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9139 "unexpected CONCAT_VECTORS");
9140 SDLoc dl(Op);
9141 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9142 SDValue Op0 = Op.getOperand(0);
9143 SDValue Op1 = Op.getOperand(1);
9144 if (!Op0.isUndef())
9145 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9146 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9147 DAG.getIntPtrConstant(0, dl));
9148 if (!Op1.isUndef())
9149 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9150 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9151 DAG.getIntPtrConstant(1, dl));
9152 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9153}
9154
9155static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9156 const ARMSubtarget *ST) {
9157 SDValue V1 = Op.getOperand(0);
9158 SDValue V2 = Op.getOperand(1);
9159 SDLoc dl(Op);
9160 EVT VT = Op.getValueType();
9161 EVT Op1VT = V1.getValueType();
9162 unsigned NumElts = VT.getVectorNumElements();
9163 unsigned Index = V2->getAsZExtVal();
9164
9165 assert(VT.getScalarSizeInBits() == 1 &&
9166 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9167 assert(ST->hasMVEIntegerOps() &&
9168 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9169
9170 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9171
9172 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9173 // promoted to v8i16, etc.
9174
9175 MVT ElType =
9176 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9177 if (NumElts == 2) {
9178 EVT SubVT = MVT::v4i32;
9179 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9180 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9181 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9182 DAG.getIntPtrConstant(i, dl));
9183 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9184 DAG.getConstant(j, dl, MVT::i32));
9185 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9186 DAG.getConstant(j + 1, dl, MVT::i32));
9187 }
9188 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9189 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9190 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9191 }
9192
9193 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9194 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9195 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9196 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9197 DAG.getIntPtrConstant(i, dl));
9198 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9199 DAG.getConstant(j, dl, MVT::i32));
9200 }
9201
9202 // Now return the result of comparing the subvector with zero,
9203 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9204 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9205 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9206}
9207
9208// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9209static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9210 const ARMSubtarget *ST) {
9211 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9212 EVT VT = N->getValueType(0);
9213 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9214 "Expected a vector i1 type!");
9215 SDValue Op = N->getOperand(0);
9216 EVT FromVT = Op.getValueType();
9217 SDLoc DL(N);
9218
9219 SDValue And =
9220 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9221 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9222 DAG.getCondCode(ISD::SETNE));
9223}
9224
9225static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9226 const ARMSubtarget *Subtarget) {
9227 if (!Subtarget->hasMVEIntegerOps())
9228 return SDValue();
9229
9230 EVT ToVT = N->getValueType(0);
9231 if (ToVT.getScalarType() == MVT::i1)
9232 return LowerTruncatei1(N, DAG, Subtarget);
9233
9234 // MVE does not have a single instruction to perform the truncation of a v4i32
9235 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9236 // Most of the instructions in MVE follow the 'Beats' system, where moving
9237 // values from different lanes is usually something that the instructions
9238 // avoid.
9239 //
9240 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9241 // which take the top/bottom half of a larger lane and extend it (or do the
9242 // opposite, truncating into the top/bottom lane from a larger lane). Note
9243 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9244 // bottom 16bits from each vector lane. This works really well with T/B
9245 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9246 // to change order.
9247 //
9248 // But truncates and sext/zext are always going to be fairly common from llvm.
9249 // We have several options for how to deal with them:
9250 // - Wherever possible combine them into an instruction that makes them
9251 // "free". This includes loads/stores, which can perform the trunc as part
9252 // of the memory operation. Or certain shuffles that can be turned into
9253 // VMOVN/VMOVL.
9254 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9255 // trunc(mul(sext(a), sext(b))) may become
9256 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9257 // this case can use VMULL). This is performed in the
9258 // MVELaneInterleavingPass.
9259 // - Otherwise we have an option. By default we would expand the
9260 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9261 // registers. One for each vector lane in the vector. This can obviously be
9262 // very expensive.
9263 // - The other option is to use the fact that loads/store can extend/truncate
9264 // to turn a trunc into two truncating stack stores and a stack reload. This
9265 // becomes 3 back-to-back memory operations, but at least that is less than
9266 // all the insert/extracts.
9267 //
9268 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9269 // are either optimized where they can be, or eventually lowered into stack
9270 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9271 // too early, where other instructions would be better, and stops us from
9272 // having to reconstruct multiple buildvector shuffles into loads/stores.
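  // For example, a v8i16 = trunc v8i32 is kept below as a single
  // MVETRUNC(v4i32 lo, v4i32 hi) node, which is either combined into something
  // better later or finally lowered via the stack as described above.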
9273 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9274 return SDValue();
9275 EVT FromVT = N->getOperand(0).getValueType();
9276 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9277 return SDValue();
9278
9279 SDValue Lo, Hi;
9280 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9281 SDLoc DL(N);
9282 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9283}
9284
9285static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9286 const ARMSubtarget *Subtarget) {
9287 if (!Subtarget->hasMVEIntegerOps())
9288 return SDValue();
9289
9290 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9291
9292 EVT ToVT = N->getValueType(0);
9293 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9294 return SDValue();
9295 SDValue Op = N->getOperand(0);
9296 EVT FromVT = Op.getValueType();
9297 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9298 return SDValue();
9299
9300 SDLoc DL(N);
9301 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9302 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9303 ExtVT = MVT::v8i16;
9304
9305 unsigned Opcode =
9306 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9307 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9308 SDValue Ext1 = Ext.getValue(1);
9309
9310 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9311 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9312 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9313 }
9314
9315 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9316}
9317
9318/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9319/// element has been zero/sign-extended, depending on the isSigned parameter,
9320/// from an integer type half its size.
9321static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9322 bool isSigned) {
9323 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9324 EVT VT = N->getValueType(0);
9325 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9326 SDNode *BVN = N->getOperand(0).getNode();
9327 if (BVN->getValueType(0) != MVT::v4i32 ||
9328 BVN->getOpcode() != ISD::BUILD_VECTOR)
9329 return false;
9330 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9331 unsigned HiElt = 1 - LoElt;
9332 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9333 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9334 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9335 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9336 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9337 return false;
9338 if (isSigned) {
9339 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9340 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9341 return true;
9342 } else {
9343 if (Hi0->isZero() && Hi1->isZero())
9344 return true;
9345 }
9346 return false;
9347 }
9348
9349 if (N->getOpcode() != ISD::BUILD_VECTOR)
9350 return false;
9351
9352 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9353 SDNode *Elt = N->getOperand(i).getNode();
9354 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9355 unsigned EltSize = VT.getScalarSizeInBits();
9356 unsigned HalfSize = EltSize / 2;
9357 if (isSigned) {
9358 if (!isIntN(HalfSize, C->getSExtValue()))
9359 return false;
9360 } else {
9361 if (!isUIntN(HalfSize, C->getZExtValue()))
9362 return false;
9363 }
9364 continue;
9365 }
9366 return false;
9367 }
9368
9369 return true;
9370}
9371
9372/// isSignExtended - Check if a node is a vector value that is sign-extended
9373/// or a constant BUILD_VECTOR with sign-extended elements.
9374static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9375 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9376 return true;
9377 if (isExtendedBUILD_VECTOR(N, DAG, true))
9378 return true;
9379 return false;
9380}
9381
9382/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9383/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9384static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9385 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9386 ISD::isZEXTLoad(N))
9387 return true;
9388 if (isExtendedBUILD_VECTOR(N, DAG, false))
9389 return true;
9390 return false;
9391}
9392
9393static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9394 if (OrigVT.getSizeInBits() >= 64)
9395 return OrigVT;
9396
9397 assert(OrigVT.isSimple() && "Expecting a simple value type");
9398
9399 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9400 switch (OrigSimpleTy) {
9401 default: llvm_unreachable("Unexpected Vector Type");
9402 case MVT::v2i8:
9403 case MVT::v2i16:
9404 return MVT::v2i32;
9405 case MVT::v4i8:
9406 return MVT::v4i16;
9407 }
9408}
9409
9410/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9411/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9412/// We insert the required extension here to get the vector to fill a D register.
9413static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9414 const EVT &OrigTy,
9415 const EVT &ExtTy,
9416 unsigned ExtOpcode) {
9417 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9418 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9419 // 64-bits we need to insert a new extension so that it will be 64-bits.
9420 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9421 if (OrigTy.getSizeInBits() >= 64)
9422 return N;
9423
9424 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9425 EVT NewVT = getExtensionTo64Bits(OrigTy);
9426
9427 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9428}
9429
9430/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9431/// does not do any sign/zero extension. If the original vector is less
9432/// than 64 bits, an appropriate extension will be added after the load to
9433/// reach a total size of 64 bits. We have to add the extension separately
9434/// because ARM does not have a sign/zero extending load for vectors.
9435static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9436 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9437
9438 // The load already has the right type.
9439 if (ExtendedTy == LD->getMemoryVT())
9440 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9441 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9442 LD->getMemOperand()->getFlags());
9443
9444 // We need to create a zextload/sextload. We cannot just create a load
9445 // followed by a zext/sext node because LowerMUL is also run during normal
9446 // operation legalization where we can't create illegal types.
9447 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9448 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9449 LD->getMemoryVT(), LD->getAlign(),
9450 LD->getMemOperand()->getFlags());
9451}
9452
9453/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9454/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9455/// the unextended value. The unextended vector should be 64 bits so that it can
9456/// be used as an operand to a VMULL instruction. If the original vector size
9457/// before extension is less than 64 bits we add an extension to resize
9458/// the vector to 64 bits.
9459static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9460 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9461 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9462 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9463 N->getOperand(0)->getValueType(0),
9464 N->getValueType(0),
9465 N->getOpcode());
9466
9467 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9468 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9469 "Expected extending load");
9470
9471 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9472 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9473 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9474 SDValue extLoad =
9475 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9476 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9477
9478 return newLoad;
9479 }
9480
9481 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9482 // have been legalized as a BITCAST from v4i32.
9483 if (N->getOpcode() == ISD::BITCAST) {
9484 SDNode *BVN = N->getOperand(0).getNode();
9485 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9486 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9487 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9488 return DAG.getBuildVector(
9489 MVT::v2i32, SDLoc(N),
9490 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9491 }
9492 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9493 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9494 EVT VT = N->getValueType(0);
9495 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9496 unsigned NumElts = VT.getVectorNumElements();
9497 MVT TruncVT = MVT::getIntegerVT(EltSize);
9498 SmallVector<SDValue, 8> Ops;
9499 SDLoc dl(N);
9500 for (unsigned i = 0; i != NumElts; ++i) {
9501 const APInt &CInt = N->getConstantOperandAPInt(i);
9502 // Element types smaller than 32 bits are not legal, so use i32 elements.
9503 // The values are implicitly truncated so sext vs. zext doesn't matter.
9504 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9505 }
9506 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9507}
9508
9509static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9510 unsigned Opcode = N->getOpcode();
9511 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9512 SDNode *N0 = N->getOperand(0).getNode();
9513 SDNode *N1 = N->getOperand(1).getNode();
9514 return N0->hasOneUse() && N1->hasOneUse() &&
9515 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9516 }
9517 return false;
9518}
9519
9520static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9521 unsigned Opcode = N->getOpcode();
9522 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9523 SDNode *N0 = N->getOperand(0).getNode();
9524 SDNode *N1 = N->getOperand(1).getNode();
9525 return N0->hasOneUse() && N1->hasOneUse() &&
9526 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9527 }
9528 return false;
9529}
9530
9531static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9532 // Multiplications are only custom-lowered for 128-bit vectors so that
9533 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9534 EVT VT = Op.getValueType();
9535 assert(VT.is128BitVector() && VT.isInteger() &&
9536 "unexpected type for custom-lowering ISD::MUL");
9537 SDNode *N0 = Op.getOperand(0).getNode();
9538 SDNode *N1 = Op.getOperand(1).getNode();
9539 unsigned NewOpc = 0;
9540 bool isMLA = false;
9541 bool isN0SExt = isSignExtended(N0, DAG);
9542 bool isN1SExt = isSignExtended(N1, DAG);
9543 if (isN0SExt && isN1SExt)
9544 NewOpc = ARMISD::VMULLs;
9545 else {
9546 bool isN0ZExt = isZeroExtended(N0, DAG);
9547 bool isN1ZExt = isZeroExtended(N1, DAG);
9548 if (isN0ZExt && isN1ZExt)
9549 NewOpc = ARMISD::VMULLu;
9550 else if (isN1SExt || isN1ZExt) {
9551 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9552 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
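 // Distributing the multiply keeps each multiplicand a plain extension that
 // the widening VMULL can absorb; the outer add/sub is then emitted on the
 // wide products (see the isMLA expansion at the bottom of this function).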
9553 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9554 NewOpc = ARMISD::VMULLs;
9555 isMLA = true;
9556 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9557 NewOpc = ARMISD::VMULLu;
9558 isMLA = true;
9559 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9560 std::swap(N0, N1);
9561 NewOpc = ARMISD::VMULLu;
9562 isMLA = true;
9563 }
9564 }
9565
9566 if (!NewOpc) {
9567 if (VT == MVT::v2i64)
9568 // Fall through to expand this. It is not legal.
9569 return SDValue();
9570 else
9571 // Other vector multiplications are legal.
9572 return Op;
9573 }
9574 }
9575
9576 // Legalize to a VMULL instruction.
9577 SDLoc DL(Op);
9578 SDValue Op0;
9579 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9580 if (!isMLA) {
9581 Op0 = SkipExtensionForVMULL(N0, DAG);
9582 assert(Op0.getValueType().is64BitVector() &&
9583 Op1.getValueType().is64BitVector() &&
9584 "unexpected types for extended operands to VMULL");
9585 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9586 }
9587
9588 // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
9589 // isel lowering to take advantage of no-stall back-to-back vmull + vmlal.
9590 // vmull q0, d4, d6
9591 // vmlal q0, d5, d6
9592 // is faster than
9593 // vaddl q0, d4, d5
9594 // vmovl q1, d6
9595 // vmul q0, q0, q1
9596 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9597 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9598 EVT Op1VT = Op1.getValueType();
9599 return DAG.getNode(N0->getOpcode(), DL, VT,
9600 DAG.getNode(NewOpc, DL, VT,
9601 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9602 DAG.getNode(NewOpc, DL, VT,
9603 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9604}
9605
9606static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9607 SelectionDAG &DAG) {
9608 // TODO: Should this propagate fast-math-flags?
9609
9610 // Convert to float
9611 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9612 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9613 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9614 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9615 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9616 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9617 // Get reciprocal estimate.
9618 // float4 recip = vrecpeq_f32(yf);
9619 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9620 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9621 Y);
9622 // Because char has a smaller range than uchar, we can actually get away
9623 // without any newton steps. This requires that we use a weird bias
9624 // of 0xb000, however (again, this has been exhaustively tested).
9625 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
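 // (Note the bias is added to the integer bit pattern of the float, i.e. it
 // adjusts the result by ULPs rather than by the value 0xb000, so that the
 // truncating convert below does not round an exact quotient down.)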
9626 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9627 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9628 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9629 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9630 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9631 // Convert back to short.
9632 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9633 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9634 return X;
9635}
9636
9637static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9638 SelectionDAG &DAG) {
9639 // TODO: Should this propagate fast-math-flags?
9640
9641 SDValue N2;
9642 // Convert to float.
9643 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9644 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9645 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9646 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9647 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9648 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9649
9650 // Use reciprocal estimate and one refinement step.
9651 // float4 recip = vrecpeq_f32(yf);
9652 // recip *= vrecpsq_f32(yf, recip);
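 // (vrecps computes 2 - d*x, so multiplying the estimate by it performs one
 // Newton-Raphson refinement of the reciprocal.)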
9653 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9654 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9655 N1);
9656 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9657 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9658 N1, N2);
9659 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9660 // Because short has a smaller range than ushort, we can actually get away
9661 // with only a single newton step. This requires that we use a weird bias
9662 // of 0x89, however (again, this has been exhaustively tested).
9663 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9664 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9665 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9666 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9667 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9668 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9669 // Convert back to integer and return.
9670 // return vmovn_s32(vcvt_s32_f32(result));
9671 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9672 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9673 return N0;
9674}
9675
9676static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9677 const ARMSubtarget *ST) {
9678 EVT VT = Op.getValueType();
9679 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9680 "unexpected type for custom-lowering ISD::SDIV");
9681
9682 SDLoc dl(Op);
9683 SDValue N0 = Op.getOperand(0);
9684 SDValue N1 = Op.getOperand(1);
9685 SDValue N2, N3;
9686
9687 if (VT == MVT::v8i8) {
9688 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9689 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9690
9691 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9692 DAG.getIntPtrConstant(4, dl));
9693 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9694 DAG.getIntPtrConstant(4, dl));
9695 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9696 DAG.getIntPtrConstant(0, dl));
9697 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9698 DAG.getIntPtrConstant(0, dl));
9699
9700 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9701 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9702
9703 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9704 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9705
9706 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9707 return N0;
9708 }
9709 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9710}
9711
9712static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9713 const ARMSubtarget *ST) {
9714 // TODO: Should this propagate fast-math-flags?
9715 EVT VT = Op.getValueType();
9716 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9717 "unexpected type for custom-lowering ISD::UDIV");
9718
9719 SDLoc dl(Op);
9720 SDValue N0 = Op.getOperand(0);
9721 SDValue N1 = Op.getOperand(1);
9722 SDValue N2, N3;
9723
9724 if (VT == MVT::v8i8) {
9725 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9726 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9727
9728 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9729 DAG.getIntPtrConstant(4, dl));
9730 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9731 DAG.getIntPtrConstant(4, dl));
9732 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9733 DAG.getIntPtrConstant(0, dl));
9734 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9735 DAG.getIntPtrConstant(0, dl));
9736
9737 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9738 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9739
9740 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9741 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9742
9743 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9744 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9745 MVT::i32),
9746 N0);
9747 return N0;
9748 }
9749
9750 // v4i16 sdiv ... Convert to float.
9751 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9752 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9753 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9754 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9755 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9756 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9757
9758 // Use reciprocal estimate and two refinement steps.
9759 // float4 recip = vrecpeq_f32(yf);
9760 // recip *= vrecpsq_f32(yf, recip);
9761 // recip *= vrecpsq_f32(yf, recip);
9762 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9763 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9764 BN1);
9765 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9766 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9767 BN1, N2);
9768 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9769 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9770 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9771 BN1, N2);
9772 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9773 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9774 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9775 // and that it will never cause us to return an answer too large).
9776 // float4 result = as_float4(as_int4(xf*recip) + 2);
9777 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9778 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9779 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9780 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9781 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9782 // Convert back to integer and return.
9783 // return vmovn_u32(vcvt_s32_f32(result));
9784 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9785 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9786 return N0;
9787}
9788
9789static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9790 SDNode *N = Op.getNode();
9791 EVT VT = N->getValueType(0);
9792 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9793
9794 SDValue Carry = Op.getOperand(2);
9795
9796 SDLoc DL(Op);
9797
9798 SDValue Result;
9799 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9800 // This converts the boolean value carry into the carry flag.
9801 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9802
9803 // Do the addition proper using the carry flag we wanted.
9804 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9805 Op.getOperand(1), Carry);
9806
9807 // Now convert the carry flag into a boolean value.
9808 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9809 } else {
9810 // ARMISD::SUBE expects a carry, not the borrow that ISD::USUBO_CARRY
9811 // provides, so we have to invert the incoming carry first.
9812 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9813 DAG.getConstant(1, DL, MVT::i32), Carry);
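 // (Carry is a boolean 0/1 here, so 1 - Carry flips between borrow and carry
 // semantics.)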
9814 // This converts the boolean value carry into the carry flag.
9815 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9816
9817 // Do the subtraction proper using the carry flag we wanted.
9818 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9819 Op.getOperand(1), Carry);
9820
9821 // Now convert the carry flag into a boolean value.
9822 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9823 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9824 // by ISD::USUBO_CARRY, so compute 1 - C.
9825 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9826 DAG.getConstant(1, DL, MVT::i32), Carry);
9827 }
9828
9829 // Return both values.
9830 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9831}
9832
9833SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9834 assert(Subtarget->isTargetDarwin());
9835
9836 // For iOS, we want to call an alternative entry point: __sincos_stret;
9837 // the return values are passed via sret.
9838 SDLoc dl(Op);
9839 SDValue Arg = Op.getOperand(0);
9840 EVT ArgVT = Arg.getValueType();
9841 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9842 auto PtrVT = getPointerTy(DAG.getDataLayout());
9843
9844 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9845
9846 // Pair of floats / doubles used to pass the result.
9847 Type *RetTy = StructType::get(ArgTy, ArgTy);
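 // __sincos_stret returns {sin, cos}; under the APCS ABI it is returned
 // indirectly through an sret stack slot (created below), otherwise directly
 // in registers.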
9848 auto &DL = DAG.getDataLayout();
9849
9851 bool ShouldUseSRet = getTM().isAPCS_ABI();
9852 SDValue SRet;
9853 if (ShouldUseSRet) {
9854 // Create stack object for sret.
9855 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9856 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9857 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9858 SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9859
9861 Entry.IsSExt = false;
9862 Entry.IsZExt = false;
9863 Entry.IsSRet = true;
9864 Args.push_back(Entry);
9865 RetTy = Type::getVoidTy(*DAG.getContext());
9866 }
9867
9868 Args.emplace_back(Arg, ArgTy);
9869
9870 RTLIB::Libcall LC =
9871 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9872 const char *LibcallName = getLibcallName(LC);
9873 CallingConv::ID CC = getLibcallCallingConv(LC);
9874 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9875
9876 TargetLowering::CallLoweringInfo CLI(DAG);
9877 CLI.setDebugLoc(dl)
9878 .setChain(DAG.getEntryNode())
9879 .setCallee(CC, RetTy, Callee, std::move(Args))
9880 .setDiscardResult(ShouldUseSRet);
9881 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9882
9883 if (!ShouldUseSRet)
9884 return CallResult.first;
9885
9886 SDValue LoadSin =
9887 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9888
9889 // Address of cos field.
9890 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9891 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9892 SDValue LoadCos =
9893 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9894
9895 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9896 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9897 LoadSin.getValue(0), LoadCos.getValue(0));
9898}
9899
9900SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9901 bool Signed,
9902 SDValue &Chain) const {
9903 EVT VT = Op.getValueType();
9904 assert((VT == MVT::i32 || VT == MVT::i64) &&
9905 "unexpected type for custom lowering DIV");
9906 SDLoc dl(Op);
9907
9908 const auto &DL = DAG.getDataLayout();
9909 RTLIB::Libcall LC;
9910 if (Signed)
9911 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9912 else
9913 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9914
9915 const char *Name = getLibcallName(LC);
9916 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9917
9919
9920 for (auto AI : {1, 0}) {
9921 SDValue Operand = Op.getOperand(AI);
9922 Args.emplace_back(Operand,
9923 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9924 }
9925
9926 CallLoweringInfo CLI(DAG);
9927 CLI.setDebugLoc(dl)
9928 .setChain(Chain)
9930 ES, std::move(Args));
9931
9932 return LowerCallTo(CLI).first;
9933}
9934
9935// This is a code-size optimisation: return the original SDIV node to
9936// DAGCombiner when we don't want to expand SDIV into a sequence of
9937// instructions, and an empty SDValue otherwise, which causes the
9938// SDIV to be expanded in DAGCombine.
9939SDValue
9940ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9941 SelectionDAG &DAG,
9942 SmallVectorImpl<SDNode *> &Created) const {
9943 // TODO: Support SREM
9944 if (N->getOpcode() != ISD::SDIV)
9945 return SDValue();
9946
9947 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9948 const bool MinSize = ST.hasMinSize();
9949 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9950 : ST.hasDivideInARMMode();
9951
9952 // Don't touch vector types; rewriting this may lead to scalarizing
9953 // the int divs.
9954 if (N->getOperand(0).getValueType().isVector())
9955 return SDValue();
9956
9957 // Bail if MinSize is not set; in addition, for both ARM and Thumb mode we
9958 // need hwdiv support for this to be really profitable.
9959 if (!(MinSize && HasDivide))
9960 return SDValue();
9961
9962 // ARM mode is a bit simpler than Thumb: we can handle large power
9963 // of 2 immediates with 1 mov instruction; no further checks required,
9964 // just return the sdiv node.
9965 if (!ST.isThumb())
9966 return SDValue(N, 0);
9967
9968 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9969 // and thus lose the code-size benefit of a 2-byte MOVS.
9970 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9971 // but as this check does exactly that, it's not worth the trouble to query TTI.
9972 if (Divisor.sgt(128))
9973 return SDValue();
9974
9975 return SDValue(N, 0);
9976}
9977
9978SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9979 bool Signed) const {
9980 assert(Op.getValueType() == MVT::i32 &&
9981 "unexpected type for custom lowering DIV");
9982 SDLoc dl(Op);
9983
9984 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9985 DAG.getEntryNode(), Op.getOperand(1));
9986
9987 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9988}
9989
9990static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9991 SDLoc DL(N);
9992 SDValue Op = N->getOperand(1);
9993 if (N->getValueType(0) == MVT::i32)
9994 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9995 SDValue Lo, Hi;
9996 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9997 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9998 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9999}
10000
10001void ARMTargetLowering::ExpandDIV_Windows(
10002 SDValue Op, SelectionDAG &DAG, bool Signed,
10003 SmallVectorImpl<SDValue> &Results) const {
10004 const auto &DL = DAG.getDataLayout();
10005
10006 assert(Op.getValueType() == MVT::i64 &&
10007 "unexpected type for custom lowering DIV");
10008 SDLoc dl(Op);
10009
10010 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
10011
10012 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10013
10014 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10015 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10016 DAG.getConstant(32, dl, getPointerTy(DL)));
10017 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10018
10019 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10020}
10021
10022static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10023 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10024 EVT MemVT = LD->getMemoryVT();
10025 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10026 MemVT == MVT::v16i1) &&
10027 "Expected a predicate type!");
10028 assert(MemVT == Op.getValueType());
10029 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10030 "Expected a non-extending load");
10031 assert(LD->isUnindexed() && "Expected an unindexed load");
10032
10033 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
10034 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10035 // need to make sure that the 8/4/2 bits are actually loaded into the correct
10036 // place, which means loading the value and then shuffling them into
10037 // the bottom bits of the predicate.
10038 // Equally, a VLDR of a v16i1 will actually load 32 bits (so would be
10039 // incorrect for BE).
10040 // On BE, the rest of LLVM also expects the reverse order from a natural
10041 // VMSR(load), so the loaded value needs to be reversed.
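 // e.g. for a little-endian v4i1, the 4 stored bits are loaded (extended) into
 // an i32, moved into the predicate register as a v16i1 via PREDICATE_CAST,
 // and the low 4 predicate lanes are then extracted below.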
10042
10043 SDLoc dl(Op);
10044 SDValue Load = DAG.getExtLoad(
10045 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10046 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10047 LD->getMemOperand());
10048 SDValue Val = Load;
10049 if (DAG.getDataLayout().isBigEndian())
10050 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10051 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10052 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10053 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10054 if (MemVT != MVT::v16i1)
10055 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10056 DAG.getConstant(0, dl, MVT::i32));
10057 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10058}
10059
10060void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10061 SelectionDAG &DAG) const {
10062 LoadSDNode *LD = cast<LoadSDNode>(N);
10063 EVT MemVT = LD->getMemoryVT();
10064 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10065
10066 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10067 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10068 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10069 SDLoc dl(N);
10070 SDValue Result = DAG.getMemIntrinsicNode(
10071 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10072 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10073 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10074 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10075 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10076 Results.append({Pair, Result.getValue(2)});
10077 }
10078}
10079
10080static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10081 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10082 EVT MemVT = ST->getMemoryVT();
10083 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10084 MemVT == MVT::v16i1) &&
10085 "Expected a predicate type!");
10086 assert(MemVT == ST->getValue().getValueType());
10087 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10088 assert(ST->isUnindexed() && "Expected an unindexed store");
10089
10090 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10091 // top bits unset and a scalar store.
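 // i.e. widen the predicate to v16i1 (extra lanes undef), move it into a GPR
 // with PREDICATE_CAST, and emit a truncating scalar store of only the low
 // 2/4/8 bits.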
10092 SDLoc dl(Op);
10093 SDValue Build = ST->getValue();
10094 if (MemVT != MVT::v16i1) {
10095 SmallVector<SDValue, 16> Ops;
10096 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10097 unsigned Elt = DAG.getDataLayout().isBigEndian()
10098 ? MemVT.getVectorNumElements() - I - 1
10099 : I;
10100 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10101 DAG.getConstant(Elt, dl, MVT::i32)));
10102 }
10103 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10104 Ops.push_back(DAG.getUNDEF(MVT::i32));
10105 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10106 }
10107 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10108 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10109 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10110 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10111 DAG.getConstant(16, dl, MVT::i32));
10112 return DAG.getTruncStore(
10113 ST->getChain(), dl, GRP, ST->getBasePtr(),
10114 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10115 ST->getMemOperand());
10116}
10117
10118static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10119 const ARMSubtarget *Subtarget) {
10120 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10121 EVT MemVT = ST->getMemoryVT();
10122 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10123
10124 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10125 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10126 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10127 SDNode *N = Op.getNode();
10128 SDLoc dl(N);
10129
10130 SDValue Lo = DAG.getNode(
10131 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10132 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10133 MVT::i32));
10134 SDValue Hi = DAG.getNode(
10135 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10136 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10137 MVT::i32));
10138
10139 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10140 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10141 MemVT, ST->getMemOperand());
10142 } else if (Subtarget->hasMVEIntegerOps() &&
10143 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10144 MemVT == MVT::v16i1))) {
10145 return LowerPredicateStore(Op, DAG);
10146 }
10147
10148 return SDValue();
10149}
10150
10151static bool isZeroVector(SDValue N) {
10152 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10153 (N->getOpcode() == ARMISD::VMOVIMM &&
10154 isNullConstant(N->getOperand(0))));
10155}
10156
10157static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10158 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10159 MVT VT = Op.getSimpleValueType();
10160 SDValue Mask = N->getMask();
10161 SDValue PassThru = N->getPassThru();
10162 SDLoc dl(Op);
10163
10164 if (isZeroVector(PassThru))
10165 return Op;
10166
10167 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10168 // zero too, and other values are lowered to a select.
10169 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10170 DAG.getTargetConstant(0, dl, MVT::i32));
10171 SDValue NewLoad = DAG.getMaskedLoad(
10172 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10173 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10174 N->getExtensionType(), N->isExpandingLoad());
10175 SDValue Combo = NewLoad;
10176 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10177 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10178 isZeroVector(PassThru->getOperand(0));
10179 if (!PassThru.isUndef() && !PassThruIsCastZero)
10180 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10181 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10182}
10183
10184static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10185 const ARMSubtarget *ST) {
10186 if (!ST->hasMVEIntegerOps())
10187 return SDValue();
10188
10189 SDLoc dl(Op);
10190 unsigned BaseOpcode = 0;
10191 switch (Op->getOpcode()) {
10192 default: llvm_unreachable("Expected VECREDUCE opcode");
10193 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10194 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10195 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10196 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10197 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10198 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10199 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10200 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10201 }
10202
10203 SDValue Op0 = Op->getOperand(0);
10204 EVT VT = Op0.getValueType();
10205 EVT EltVT = VT.getVectorElementType();
10206 unsigned NumElts = VT.getVectorNumElements();
10207 unsigned NumActiveLanes = NumElts;
10208
10209 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10210 NumActiveLanes == 2) &&
10211 "Only expected a power 2 vector size");
10212
10213 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10214 // allows us to easily extract vector elements from the lanes.
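 // (Each iteration combines every element with its neighbour via the
 // appropriate VREV, halving the number of distinct partial results.)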
10215 while (NumActiveLanes > 4) {
10216 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10217 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10218 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10219 NumActiveLanes /= 2;
10220 }
10221
10222 SDValue Res;
10223 if (NumActiveLanes == 4) {
10224 // The remaining 4 elements are summed sequentially
10225 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10226 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10227 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10228 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10229 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10230 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10231 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10232 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10233 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10234 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10235 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10236 } else {
10237 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10238 DAG.getConstant(0, dl, MVT::i32));
10239 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10240 DAG.getConstant(1, dl, MVT::i32));
10241 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10242 }
10243
10244 // Result type may be wider than element type.
10245 if (EltVT != Op->getValueType(0))
10246 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10247 return Res;
10248}
10249
10250static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10251 const ARMSubtarget *ST) {
10252 if (!ST->hasMVEFloatOps())
10253 return SDValue();
10254 return LowerVecReduce(Op, DAG, ST);
10255}
10256
10257static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10258 const ARMSubtarget *ST) {
10259 if (!ST->hasNEON())
10260 return SDValue();
10261
10262 SDLoc dl(Op);
10263 SDValue Op0 = Op->getOperand(0);
10264 EVT VT = Op0.getValueType();
10265 EVT EltVT = VT.getVectorElementType();
10266
10267 unsigned PairwiseIntrinsic = 0;
10268 switch (Op->getOpcode()) {
10269 default:
10270 llvm_unreachable("Expected VECREDUCE opcode");
10271 case ISD::VECREDUCE_UMIN:
10272 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10273 break;
10274 case ISD::VECREDUCE_UMAX:
10275 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10276 break;
10277 case ISD::VECREDUCE_SMIN:
10278 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10279 break;
10280 case ISD::VECREDUCE_SMAX:
10281 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10282 break;
10283 }
10284 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10285
10286 unsigned NumElts = VT.getVectorNumElements();
10287 unsigned NumActiveLanes = NumElts;
10288
10289 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10290 NumActiveLanes == 2) &&
10291 "Only expected a power 2 vector size");
10292
10293 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
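 // The first pairwise op below folds Lo and Hi into a single 64-bit vector of
 // partial results.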
10294 if (VT.is128BitVector()) {
10295 SDValue Lo, Hi;
10296 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10297 VT = Lo.getValueType();
10298 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10299 NumActiveLanes /= 2;
10300 }
10301
10302 // Use pairwise reductions until one lane remains
10303 while (NumActiveLanes > 1) {
10304 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10305 NumActiveLanes /= 2;
10306 }
10307
10308 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10309 DAG.getConstant(0, dl, MVT::i32));
10310
10311 // Result type may be wider than element type.
10312 if (EltVT != Op.getValueType()) {
10313 unsigned Extend = 0;
10314 switch (Op->getOpcode()) {
10315 default:
10316 llvm_unreachable("Expected VECREDUCE opcode");
10317 case ISD::VECREDUCE_UMIN:
10318 case ISD::VECREDUCE_UMAX:
10319 Extend = ISD::ZERO_EXTEND;
10320 break;
10321 case ISD::VECREDUCE_SMIN:
10322 case ISD::VECREDUCE_SMAX:
10323 Extend = ISD::SIGN_EXTEND;
10324 break;
10325 }
10326 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10327 }
10328 return Res;
10329}
10330
10331static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10332 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10333 // Acquire/Release load/store is not legal for targets without a dmb or
10334 // equivalent available.
10335 return SDValue();
10336
10337 // Monotonic load/store is legal for all targets.
10338 return Op;
10339}
10340
10341static void ReplaceREADCYCLECOUNTER(SDNode *N,
10342 SmallVectorImpl<SDValue> &Results,
10343 SelectionDAG &DAG,
10344 const ARMSubtarget *Subtarget) {
10345 SDLoc DL(N);
10346 // Under Power Management extensions, the cycle-count is:
10347 // mrc p15, #0, <Rt>, c9, c13, #0
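 // Only the low 32 bits of the count are available here; they are
 // zero-extended into the i64 result via the BUILD_PAIR below.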
10348 SDValue Ops[] = { N->getOperand(0), // Chain
10349 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10350 DAG.getTargetConstant(15, DL, MVT::i32),
10351 DAG.getTargetConstant(0, DL, MVT::i32),
10352 DAG.getTargetConstant(9, DL, MVT::i32),
10353 DAG.getTargetConstant(13, DL, MVT::i32),
10354 DAG.getTargetConstant(0, DL, MVT::i32)
10355 };
10356
10357 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10358 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10359 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10360 DAG.getConstant(0, DL, MVT::i32)));
10361 Results.push_back(Cycles32.getValue(1));
10362}
10363
10364static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10365 SDValue V1) {
10366 SDLoc dl(V0.getNode());
10367 SDValue RegClass =
10368 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10369 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10370 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10371 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10372 return SDValue(
10373 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10374}
10375
10376static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10377 SDLoc dl(V.getNode());
10378 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10379 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10380 if (isBigEndian)
10381 std::swap(VLo, VHi);
10382 return createGPRPairNode2xi32(DAG, VLo, VHi);
10383}
10384
10385static void ReplaceCMP_SWAP_64Results(SDNode *N,
10386 SmallVectorImpl<SDValue> &Results,
10387 SelectionDAG &DAG) {
10388 assert(N->getValueType(0) == MVT::i64 &&
10389 "AtomicCmpSwap on types less than 64 should be legal");
10390 SDValue Ops[] = {
10391 createGPRPairNode2xi32(DAG, N->getOperand(1),
10392 DAG.getUNDEF(MVT::i32)), // pointer, temp
10393 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10394 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10395 N->getOperand(0), // chain in
10396 };
10397 SDNode *CmpSwap = DAG.getMachineNode(
10398 ARM::CMP_SWAP_64, SDLoc(N),
10399 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
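 // CMP_SWAP_64 is a pseudo that is later expanded into an LDREXD/STREXD loop;
 // its first result is the old value as a GPRPair, and the last is the chain.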
10400
10401 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10402 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10403
10404 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10405
10406 SDValue Lo =
10407 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10408 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10409 SDValue Hi =
10410 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10411 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10412 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10413 Results.push_back(SDValue(CmpSwap, 2));
10414}
10415
10416SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10417 SDLoc dl(Op);
10418 EVT VT = Op.getValueType();
10419 SDValue Chain = Op.getOperand(0);
10420 SDValue LHS = Op.getOperand(1);
10421 SDValue RHS = Op.getOperand(2);
10422 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10423 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10424
10425 // If we don't have instructions of this float type then soften to a libcall
10426 // and use SETCC instead.
10427 if (isUnsupportedFloatingType(LHS.getValueType())) {
10428 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10429 Chain, IsSignaling);
10430 if (!RHS.getNode()) {
10431 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10432 CC = ISD::SETNE;
10433 }
10434 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10435 DAG.getCondCode(CC));
10436 return DAG.getMergeValues({Result, Chain}, dl);
10437 }
10438
10439 ARMCC::CondCodes CondCode, CondCode2;
10440 FPCCToARMCC(CC, CondCode, CondCode2);
10441
10442 SDValue True = DAG.getConstant(1, dl, VT);
10443 SDValue False = DAG.getConstant(0, dl, VT);
10444 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10445 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10446 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10447 if (CondCode2 != ARMCC::AL) {
10448 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10449 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10450 }
10451 return DAG.getMergeValues({Result, Chain}, dl);
10452}
10453
10454SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10455 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10456
10457 EVT VT = getPointerTy(DAG.getDataLayout());
10458 int FI = MFI.CreateFixedObject(4, 0, false);
10459 return DAG.getFrameIndex(FI, VT);
10460}
10461
10462SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10463 SelectionDAG &DAG) const {
10464 SDLoc DL(Op);
10465 MakeLibCallOptions CallOptions;
10466 MVT SVT = Op.getOperand(0).getSimpleValueType();
10467 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10468 SDValue Res =
10469 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10470 return DAG.getBitcast(MVT::i32, Res);
10471}
10472
10473SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10474 SDLoc dl(Op);
10475 SDValue LHS = Op.getOperand(0);
10476 SDValue RHS = Op.getOperand(1);
10477
10478 // Determine if this is signed or unsigned comparison
10479 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10480
10481 // Special case for Thumb1 UCMP only
10482 if (!IsSigned && Subtarget->isThumb1Only()) {
10483 // For Thumb unsigned comparison, use this sequence:
10484 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10485 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10486 // cmp r1, r0 ; compare RHS with LHS
10487 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10488 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10489
10490 // First subtraction: LHS - RHS
10491 SDValue Sub1WithFlags = DAG.getNode(
10492 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10493 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10494 SDValue Flags1 = Sub1WithFlags.getValue(1);
10495
10496 // SUBE: Sub1Result - Sub1Result - !carry
10497 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10498 SDValue Sbc1 =
10499 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10500 Sub1Result, Sub1Result, Flags1);
10501 SDValue Sbc1Result = Sbc1.getValue(0);
10502
10503 // Second comparison: RHS vs LHS (reverse comparison)
10504 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10505
10506 // SUBE: RHS - RHS - !carry
10507 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10508 SDValue Sbc2 = DAG.getNode(
10509 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10510 SDValue Sbc2Result = Sbc2.getValue(0);
10511
10512 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10513 SDValue Result =
10514 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
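 // Sbc1Result is -1 iff LHS < RHS and Sbc2Result is -1 iff LHS > RHS, so
 // their difference is exactly -1, 0 or +1.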
10515 if (Op.getValueType() != MVT::i32)
10516 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10517
10518 return Result;
10519 }
10520
10521 // For the ARM assembly pattern (GT/LT for signed, HI/LO for unsigned):
10522 //   subs  r0, r0, r1  ; subtract RHS from LHS and set flags
10523 //   movgt r0, #1      ; if LHS > RHS, set result to 1
10524 //   mvnlt r0, #0      ; if LHS < RHS, set result to -1
10525 //                     ; if LHS == RHS, result remains 0 from the subs
10527
10528 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10529 unsigned Opcode = ARMISD::SUBC;
10530
10531 // Check if RHS is a subtraction against 0: (0 - X)
10532 if (RHS.getOpcode() == ISD::SUB) {
10533 SDValue SubLHS = RHS.getOperand(0);
10534 SDValue SubRHS = RHS.getOperand(1);
10535
10536 // Check if it's 0 - X
10537 if (isNullConstant(SubLHS)) {
10538 bool CanUseAdd = false;
10539 if (IsSigned) {
10540 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10541 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10543 .isMinSignedValue()) {
10544 CanUseAdd = true;
10545 }
10546 } else {
10547 // For UCMP: only if X is known to never be zero
10548 if (DAG.isKnownNeverZero(SubRHS)) {
10549 CanUseAdd = true;
10550 }
10551 }
10552
10553 if (CanUseAdd) {
10554 Opcode = ARMISD::ADDC;
10555 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10556 // LHS - (0 - X)
10557 }
10558 }
10559 }
10560
10561 // Generate the operation with flags
10562 SDValue OpWithFlags =
10563 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10564
10565 SDValue OpResult = OpWithFlags.getValue(0);
10566 SDValue Flags = OpWithFlags.getValue(1);
10567
10568 // Constants for conditional moves
10569 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10570 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10571
10572 // Select condition codes based on signed vs unsigned
10573 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10574 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10575
10576 // First conditional move: if greater than, set to 1
10577 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10578 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10579 GTCondValue, Flags);
10580
10581 // Second conditional move: if less than, set to -1
10582 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10583 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10584 LTCondValue, Flags);
10585
10586 if (Op.getValueType() != MVT::i32)
10587 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10588
10589 return Result2;
10590}
10591
10592SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10593 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10594 switch (Op.getOpcode()) {
10595 default: llvm_unreachable("Don't know how to custom lower this!");
10596 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10597 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10598 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10599 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10600 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10601 case ISD::SELECT: return LowerSELECT(Op, DAG);
10602 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10603 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10604 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10605 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10606 case ISD::VASTART: return LowerVASTART(Op, DAG);
10607 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10608 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10609 case ISD::SINT_TO_FP:
10610 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10611 case ISD::STRICT_FP_TO_SINT:
10612 case ISD::STRICT_FP_TO_UINT:
10613 case ISD::FP_TO_SINT:
10614 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10615 case ISD::FP_TO_SINT_SAT:
10616 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10617 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10618 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10619 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10620 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10621 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10622 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10623 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10624 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10625 Subtarget);
10626 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10627 case ISD::SHL:
10628 case ISD::SRL:
10629 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10630 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10631 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10632 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10633 case ISD::SRL_PARTS:
10634 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10635 case ISD::CTTZ:
10636 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10637 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10638 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10639 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10640 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10641 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10642 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10643 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10644 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10645 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10646 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10647 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10648 case ISD::SIGN_EXTEND:
10649 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10650 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10651 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10652 case ISD::SET_FPMODE:
10653 return LowerSET_FPMODE(Op, DAG);
10654 case ISD::RESET_FPMODE:
10655 return LowerRESET_FPMODE(Op, DAG);
10656 case ISD::MUL: return LowerMUL(Op, DAG);
10657 case ISD::SDIV:
10658 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10659 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10660 return LowerSDIV(Op, DAG, Subtarget);
10661 case ISD::UDIV:
10662 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10663 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10664 return LowerUDIV(Op, DAG, Subtarget);
10665 case ISD::UADDO_CARRY:
10666 case ISD::USUBO_CARRY:
10667 return LowerUADDSUBO_CARRY(Op, DAG);
10668 case ISD::SADDO:
10669 case ISD::SSUBO:
10670 return LowerSignedALUO(Op, DAG);
10671 case ISD::UADDO:
10672 case ISD::USUBO:
10673 return LowerUnsignedALUO(Op, DAG);
10674 case ISD::SADDSAT:
10675 case ISD::SSUBSAT:
10676 case ISD::UADDSAT:
10677 case ISD::USUBSAT:
10678 return LowerADDSUBSAT(Op, DAG, Subtarget);
10679 case ISD::LOAD:
10680 return LowerPredicateLoad(Op, DAG);
10681 case ISD::STORE:
10682 return LowerSTORE(Op, DAG, Subtarget);
10683 case ISD::MLOAD:
10684 return LowerMLOAD(Op, DAG);
10685 case ISD::VECREDUCE_MUL:
10686 case ISD::VECREDUCE_AND:
10687 case ISD::VECREDUCE_OR:
10688 case ISD::VECREDUCE_XOR:
10689 return LowerVecReduce(Op, DAG, Subtarget);
10690 case ISD::VECREDUCE_FADD:
10691 case ISD::VECREDUCE_FMUL:
10692 case ISD::VECREDUCE_FMIN:
10693 case ISD::VECREDUCE_FMAX:
10694 return LowerVecReduceF(Op, DAG, Subtarget);
10695 case ISD::VECREDUCE_UMIN:
10696 case ISD::VECREDUCE_UMAX:
10697 case ISD::VECREDUCE_SMIN:
10698 case ISD::VECREDUCE_SMAX:
10699 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10700 case ISD::ATOMIC_LOAD:
10701 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10702 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10703 case ISD::SDIVREM:
10704 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10705 case ISD::DYNAMIC_STACKALLOC:
10706 if (Subtarget->isTargetWindows())
10707 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10708 llvm_unreachable("Don't know how to custom lower this!");
10709 case ISD::STRICT_FP_ROUND:
10710 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10711 case ISD::STRICT_FP_EXTEND:
10712 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10713 case ISD::STRICT_FSETCC:
10714 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10715 case ISD::SPONENTRY:
10716 return LowerSPONENTRY(Op, DAG);
10717 case ISD::FP_TO_BF16:
10718 return LowerFP_TO_BF16(Op, DAG);
10719 case ARMISD::WIN__DBZCHK: return SDValue();
10720 case ISD::UCMP:
10721 case ISD::SCMP:
10722 return LowerCMP(Op, DAG);
10723 case ISD::ABS:
10724 return LowerABS(Op, DAG);
10725 }
10726}
10727
10728static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10729 SelectionDAG &DAG) {
10730 unsigned IntNo = N->getConstantOperandVal(0);
10731 unsigned Opc = 0;
10732 if (IntNo == Intrinsic::arm_smlald)
10733 Opc = ARMISD::SMLALD;
10734 else if (IntNo == Intrinsic::arm_smlaldx)
10735 Opc = ARMISD::SMLALDX;
10736 else if (IntNo == Intrinsic::arm_smlsld)
10737 Opc = ARMISD::SMLSLD;
10738 else if (IntNo == Intrinsic::arm_smlsldx)
10739 Opc = ARMISD::SMLSLDX;
10740 else
10741 return;
10742
10743 SDLoc dl(N);
10744 SDValue Lo, Hi;
10745 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10746
10747 SDValue LongMul = DAG.getNode(Opc, dl,
10748 DAG.getVTList(MVT::i32, MVT::i32),
10749 N->getOperand(1), N->getOperand(2),
10750 Lo, Hi);
10751 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10752 LongMul.getValue(0), LongMul.getValue(1)));
10753}
10754
10755/// ReplaceNodeResults - Replace the results of node with an illegal result
10756/// type with new values built out of custom code.
10757void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10758 SmallVectorImpl<SDValue> &Results,
10759 SelectionDAG &DAG) const {
10760 SDValue Res;
10761 switch (N->getOpcode()) {
10762 default:
10763 llvm_unreachable("Don't know how to custom expand this!");
10764 case ISD::READ_REGISTER:
10765 Res = ExpandREAD_REGISTER(N, DAG);
10766 break;
10767 case ISD::BITCAST:
10768 Res = ExpandBITCAST(N, DAG, Subtarget);
10769 break;
10770 case ISD::SRL:
10771 case ISD::SRA:
10772 case ISD::SHL:
10773 Res = Expand64BitShift(N, DAG, Subtarget);
10774 break;
10775 case ISD::SREM:
10776 case ISD::UREM:
10777 Res = LowerREM(N, DAG);
10778 break;
10779 case ISD::SDIVREM:
10780 case ISD::UDIVREM:
10781 Res = LowerDivRem(SDValue(N, 0), DAG);
10782 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10783 Results.push_back(Res.getValue(0));
10784 Results.push_back(Res.getValue(1));
10785 return;
10786 case ISD::SADDSAT:
10787 case ISD::SSUBSAT:
10788 case ISD::UADDSAT:
10789 case ISD::USUBSAT:
10790 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10791 break;
10792 case ISD::READCYCLECOUNTER:
10793 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10794 return;
10795 case ISD::UDIV:
10796 case ISD::SDIV:
10797 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10798 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10799 Results);
10800 case ISD::ATOMIC_CMP_SWAP:
10801 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10802 return;
10803 case ISD::INTRINSIC_WO_CHAIN:
10804 return ReplaceLongIntrinsic(N, Results, DAG);
10805 case ISD::LOAD:
10806 LowerLOAD(N, Results, DAG);
10807 break;
10808 case ISD::TRUNCATE:
10809 Res = LowerTruncate(N, DAG, Subtarget);
10810 break;
10811 case ISD::SIGN_EXTEND:
10812 case ISD::ZERO_EXTEND:
10813 Res = LowerVectorExtend(N, DAG, Subtarget);
10814 break;
10815 case ISD::FP_TO_SINT_SAT:
10816 case ISD::FP_TO_UINT_SAT:
10817 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10818 break;
10819 }
10820 if (Res.getNode())
10821 Results.push_back(Res);
10822}
10823
10824//===----------------------------------------------------------------------===//
10825// ARM Scheduler Hooks
10826//===----------------------------------------------------------------------===//
10827
10828/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10829/// registers the function context.
10830void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10831 MachineBasicBlock *MBB,
10832 MachineBasicBlock *DispatchBB,
10833 int FI) const {
10834 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10835 "ROPI/RWPI not currently supported with SjLj");
10836 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10837 DebugLoc dl = MI.getDebugLoc();
10838 MachineFunction *MF = MBB->getParent();
10839 MachineRegisterInfo *MRI = &MF->getRegInfo();
10840 MachineConstantPool *MCP = MF->getConstantPool();
10841 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10842 const Function &F = MF->getFunction();
10843
10844 bool isThumb = Subtarget->isThumb();
10845 bool isThumb2 = Subtarget->isThumb2();
10846
10847 unsigned PCLabelId = AFI->createPICLabelUId();
10848 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
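 // (The PC reads 4 bytes ahead of the current instruction in Thumb and 8
 // bytes ahead in ARM, hence the pc-relative adjustment.)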
10849 ARMConstantPoolValue *CPV =
10850 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10851 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10852
10853 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10854 : &ARM::GPRRegClass;
10855
10856 // Grab constant pool and fixed stack memory operands.
10857 MachineMemOperand *CPMMO =
10860
10861 MachineMemOperand *FIMMOSt =
10864
10865 // Load the address of the dispatch MBB into the jump buffer.
10866 if (isThumb2) {
10867 // Incoming value: jbuf
10868 // ldr.n r5, LCPI1_1
10869 // orr r5, r5, #1
10870 // add r5, pc
10871 // str r5, [$jbuf, #+4] ; &jbuf[1]
10872 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10873 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10875 .addMemOperand(CPMMO)
10877 // Set the low bit because of thumb mode.
10878 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10879 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10880 .addReg(NewVReg1, RegState::Kill)
10881 .addImm(0x01)
10883 .add(condCodeOp());
10884 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10885 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10886 .addReg(NewVReg2, RegState::Kill)
10887 .addImm(PCLabelId);
10888 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10889 .addReg(NewVReg3, RegState::Kill)
10890 .addFrameIndex(FI)
10891 .addImm(36) // &jbuf[1] :: pc
10892 .addMemOperand(FIMMOSt)
10894 } else if (isThumb) {
10895 // Incoming value: jbuf
10896 // ldr.n r1, LCPI1_4
10897 // add r1, pc
10898 // mov r2, #1
10899 // orrs r1, r2
10900 // add r2, $jbuf, #+4 ; &jbuf[1]
10901 // str r1, [r2]
10902 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10903 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10905 .addMemOperand(CPMMO)
10907 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10908 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10909 .addReg(NewVReg1, RegState::Kill)
10910 .addImm(PCLabelId);
10911 // Set the low bit because of thumb mode.
10912 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10913 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10914 .addReg(ARM::CPSR, RegState::Define)
10915 .addImm(1)
10917 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10918 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10919 .addReg(ARM::CPSR, RegState::Define)
10920 .addReg(NewVReg2, RegState::Kill)
10921 .addReg(NewVReg3, RegState::Kill)
10923 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10924 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10925 .addFrameIndex(FI)
10926 .addImm(36); // &jbuf[1] :: pc
10927 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10928 .addReg(NewVReg4, RegState::Kill)
10929 .addReg(NewVReg5, RegState::Kill)
10930 .addImm(0)
10931 .addMemOperand(FIMMOSt)
10933 } else {
10934 // Incoming value: jbuf
10935 // ldr r1, LCPI1_1
10936 // add r1, pc, r1
10937 // str r1, [$jbuf, #+4] ; &jbuf[1]
10938 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10939 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10941 .addImm(0)
10942 .addMemOperand(CPMMO)
10944 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10945 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10946 .addReg(NewVReg1, RegState::Kill)
10947 .addImm(PCLabelId)
10949 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10950 .addReg(NewVReg2, RegState::Kill)
10951 .addFrameIndex(FI)
10952 .addImm(36) // &jbuf[1] :: pc
10953 .addMemOperand(FIMMOSt)
10955 }
10956}
10957
10958void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10959 MachineBasicBlock *MBB) const {
10960 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10961 DebugLoc dl = MI.getDebugLoc();
10962 MachineFunction *MF = MBB->getParent();
10963 MachineRegisterInfo *MRI = &MF->getRegInfo();
10964 MachineFrameInfo &MFI = MF->getFrameInfo();
10965 int FI = MFI.getFunctionContextIndex();
10966
10967 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10968 : &ARM::GPRnopcRegClass;
10969
10970 // Get a mapping of the call site numbers to all of the landing pads they're
10971 // associated with.
10972 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10973 unsigned MaxCSNum = 0;
10974 for (MachineBasicBlock &BB : *MF) {
10975 if (!BB.isEHPad())
10976 continue;
10977
10978 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10979 // pad.
10980 for (MachineInstr &II : BB) {
10981 if (!II.isEHLabel())
10982 continue;
10983
10984 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10985 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10986
10987 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10988 for (unsigned Idx : CallSiteIdxs) {
10989 CallSiteNumToLPad[Idx].push_back(&BB);
10990 MaxCSNum = std::max(MaxCSNum, Idx);
10991 }
10992 break;
10993 }
10994 }
10995
10996 // Get an ordered list of the machine basic blocks for the jump table.
10997 std::vector<MachineBasicBlock*> LPadList;
10998 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10999 LPadList.reserve(CallSiteNumToLPad.size());
11000 for (unsigned I = 1; I <= MaxCSNum; ++I) {
11001 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
11002 for (MachineBasicBlock *MBB : MBBList) {
11003 LPadList.push_back(MBB);
11004 InvokeBBs.insert_range(MBB->predecessors());
11005 }
11006 }
11007
11008 assert(!LPadList.empty() &&
11009 "No landing pad destinations for the dispatch jump table!");
11010
11011 // Create the jump table and associated information.
11012 MachineJumpTableInfo *JTI =
11013 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
11014 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
11015
11016 // Create the MBBs for the dispatch code.
11017
11018 // Shove the dispatch's address into the return slot in the function context.
11019 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
11020 DispatchBB->setIsEHPad();
11021
11022 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11023
11024 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
11025 DispatchBB->addSuccessor(TrapBB);
11026
11027 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11028 DispatchBB->addSuccessor(DispContBB);
11029
11030 // Insert the MBBs into the function.
11031 MF->insert(MF->end(), DispatchBB);
11032 MF->insert(MF->end(), DispContBB);
11033 MF->insert(MF->end(), TrapBB);
11034
11035 // Insert code into the entry block that creates and registers the function
11036 // context.
11037 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11038
11039 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11042
11043 MachineInstrBuilder MIB;
11044 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11045
11046 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11047 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11048
11049 // Add a register mask with no preserved registers. This results in all
11050 // registers being marked as clobbered. This can't work if the dispatch block
11051 // is in a Thumb1 function and is linked with ARM code which uses the FP
11052 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11054
11055 bool IsPositionIndependent = isPositionIndependent();
11056 unsigned NumLPads = LPadList.size();
11057 if (Subtarget->isThumb2()) {
11058 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11059 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11060 .addFrameIndex(FI)
11061 .addImm(4)
11062 .addMemOperand(FIMMOLd)
11064
11065 if (NumLPads < 256) {
11066 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11067 .addReg(NewVReg1)
11068 .addImm(LPadList.size())
11070 } else {
11071 Register VReg1 = MRI->createVirtualRegister(TRC);
11072 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11073 .addImm(NumLPads & 0xFFFF)
11075
11076 unsigned VReg2 = VReg1;
11077 if ((NumLPads & 0xFFFF0000) != 0) {
11078 VReg2 = MRI->createVirtualRegister(TRC);
11079 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11080 .addReg(VReg1)
11081 .addImm(NumLPads >> 16)
11083 }
11084
11085 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11086 .addReg(NewVReg1)
11087 .addReg(VReg2)
11089 }
11090
11091 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11092 .addMBB(TrapBB)
11094 .addReg(ARM::CPSR);
11095
11096 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11097 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11098 .addJumpTableIndex(MJTI)
11100
11101 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11102 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11103 .addReg(NewVReg3, RegState::Kill)
11104 .addReg(NewVReg1)
11107 .add(condCodeOp());
11108
11109 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11110 .addReg(NewVReg4, RegState::Kill)
11111 .addReg(NewVReg1)
11112 .addJumpTableIndex(MJTI);
11113 } else if (Subtarget->isThumb()) {
11114 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11115 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11116 .addFrameIndex(FI)
11117 .addImm(1)
11118 .addMemOperand(FIMMOLd)
11120
11121 if (NumLPads < 256) {
11122 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11123 .addReg(NewVReg1)
11124 .addImm(NumLPads)
11126 } else {
11127 MachineConstantPool *ConstantPool = MF->getConstantPool();
11128 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11129 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11130
11131 // MachineConstantPool wants an explicit alignment.
11132 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11133 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11134
11135 Register VReg1 = MRI->createVirtualRegister(TRC);
11136 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11137 .addReg(VReg1, RegState::Define)
11140 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11141 .addReg(NewVReg1)
11142 .addReg(VReg1)
11144 }
11145
11146 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11147 .addMBB(TrapBB)
11149 .addReg(ARM::CPSR);
11150
11151 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11152 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11153 .addReg(ARM::CPSR, RegState::Define)
11154 .addReg(NewVReg1)
11155 .addImm(2)
11157
11158 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11159 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11160 .addJumpTableIndex(MJTI)
11162
11163 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11164 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11165 .addReg(ARM::CPSR, RegState::Define)
11166 .addReg(NewVReg2, RegState::Kill)
11167 .addReg(NewVReg3)
11169
11170 MachineMemOperand *JTMMOLd =
11171 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11173
11174 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11175 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11176 .addReg(NewVReg4, RegState::Kill)
11177 .addImm(0)
11178 .addMemOperand(JTMMOLd)
11180
11181 unsigned NewVReg6 = NewVReg5;
11182 if (IsPositionIndependent) {
11183 NewVReg6 = MRI->createVirtualRegister(TRC);
11184 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11185 .addReg(ARM::CPSR, RegState::Define)
11186 .addReg(NewVReg5, RegState::Kill)
11187 .addReg(NewVReg3)
11189 }
11190
11191 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11192 .addReg(NewVReg6, RegState::Kill)
11193 .addJumpTableIndex(MJTI);
11194 } else {
11195 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11196 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11197 .addFrameIndex(FI)
11198 .addImm(4)
11199 .addMemOperand(FIMMOLd)
11201
11202 if (NumLPads < 256) {
11203 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11204 .addReg(NewVReg1)
11205 .addImm(NumLPads)
11207 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11208 Register VReg1 = MRI->createVirtualRegister(TRC);
11209 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11210 .addImm(NumLPads & 0xFFFF)
11212
11213 unsigned VReg2 = VReg1;
11214 if ((NumLPads & 0xFFFF0000) != 0) {
11215 VReg2 = MRI->createVirtualRegister(TRC);
11216 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11217 .addReg(VReg1)
11218 .addImm(NumLPads >> 16)
11220 }
11221
11222 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11223 .addReg(NewVReg1)
11224 .addReg(VReg2)
11226 } else {
11227 MachineConstantPool *ConstantPool = MF->getConstantPool();
11228 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11229 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11230
11231 // MachineConstantPool wants an explicit alignment.
11232 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11233 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11234
11235 Register VReg1 = MRI->createVirtualRegister(TRC);
11236 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11237 .addReg(VReg1, RegState::Define)
11239 .addImm(0)
11241 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11242 .addReg(NewVReg1)
11243 .addReg(VReg1, RegState::Kill)
11245 }
11246
11247 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11248 .addMBB(TrapBB)
11250 .addReg(ARM::CPSR);
11251
11252 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11253 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11254 .addReg(NewVReg1)
11257 .add(condCodeOp());
11258 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11259 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11260 .addJumpTableIndex(MJTI)
11262
11263 MachineMemOperand *JTMMOLd =
11264 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11266 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11267 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11268 .addReg(NewVReg3, RegState::Kill)
11269 .addReg(NewVReg4)
11270 .addImm(0)
11271 .addMemOperand(JTMMOLd)
11273
11274 if (IsPositionIndependent) {
11275 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11276 .addReg(NewVReg5, RegState::Kill)
11277 .addReg(NewVReg4)
11278 .addJumpTableIndex(MJTI);
11279 } else {
11280 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11281 .addReg(NewVReg5, RegState::Kill)
11282 .addJumpTableIndex(MJTI);
11283 }
11284 }
11285
11286 // Add the jump table entries as successors to the MBB.
11287 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11288 for (MachineBasicBlock *CurMBB : LPadList) {
11289 if (SeenMBBs.insert(CurMBB).second)
11290 DispContBB->addSuccessor(CurMBB);
11291 }
11292
11293 // N.B. the order the invoke BBs are processed in doesn't matter here.
11294 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11296 for (MachineBasicBlock *BB : InvokeBBs) {
11297
11298 // Remove the landing pad successor from the invoke block and replace it
11299 // with the new dispatch block.
11300 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11301 while (!Successors.empty()) {
11302 MachineBasicBlock *SMBB = Successors.pop_back_val();
11303 if (SMBB->isEHPad()) {
11304 BB->removeSuccessor(SMBB);
11305 MBBLPads.push_back(SMBB);
11306 }
11307 }
11308
11309 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11310 BB->normalizeSuccProbs();
11311
11312 // Find the invoke call and mark all of the callee-saved registers as
11313 // 'implicit defined' so that they're spilled. This prevents code from
11314 // moving instructions to before the EH block, where they will never be
11315 // executed.
11317 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11318 if (!II->isCall()) continue;
11319
11320 DenseSet<unsigned> DefRegs;
11322 OI = II->operands_begin(), OE = II->operands_end();
11323 OI != OE; ++OI) {
11324 if (!OI->isReg()) continue;
11325 DefRegs.insert(OI->getReg());
11326 }
11327
11328 MachineInstrBuilder MIB(*MF, &*II);
11329
11330 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11331 unsigned Reg = SavedRegs[i];
11332 if (Subtarget->isThumb2() &&
11333 !ARM::tGPRRegClass.contains(Reg) &&
11334 !ARM::hGPRRegClass.contains(Reg))
11335 continue;
11336 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11337 continue;
11338 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11339 continue;
11340 if (!DefRegs.contains(Reg))
11342 }
11343
11344 break;
11345 }
11346 }
11347
11348 // Mark all former landing pads as non-landing pads. The dispatch is the only
11349 // landing pad now.
11350 for (MachineBasicBlock *MBBLPad : MBBLPads)
11351 MBBLPad->setIsEHPad(false);
11352
11353 // The instruction is gone now.
11354 MI.eraseFromParent();
11355}
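// --- Illustrative sketch, not part of ARMISelLowering.cpp ---
// A highly simplified C++ picture of the dispatch the code above builds: the
// call-site index loaded from the function context is range-checked against
// the landing-pad count and then used to index the jump table. The exact
// comparison and index encoding are simplified; `LandingPads` and `Trap` are
// hypothetical stand-ins.
static void sjljDispatchSketch(unsigned CallSiteIdx, unsigned NumLPads,
                               void (*const *LandingPads)(), void (*Trap)()) {
  if (CallSiteIdx >= NumLPads) // DispatchBB: CMP + conditional branch to TrapBB
    Trap();
  else
    LandingPads[CallSiteIdx](); // DispContBB: jump through the inline jump table
}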
11356
11357 static
11358 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11359 for (MachineBasicBlock *S : MBB->successors())
11360 if (S != Succ)
11361 return S;
11362 llvm_unreachable("Expecting a BB with two successors!");
11363}
11364
11365 /// Return the load opcode for a given load size. If the load size is >= 8, a
11366 /// NEON opcode will be returned.
11367static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11368 if (LdSize >= 8)
11369 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11370 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11371 if (IsThumb1)
11372 return LdSize == 4 ? ARM::tLDRi
11373 : LdSize == 2 ? ARM::tLDRHi
11374 : LdSize == 1 ? ARM::tLDRBi : 0;
11375 if (IsThumb2)
11376 return LdSize == 4 ? ARM::t2LDR_POST
11377 : LdSize == 2 ? ARM::t2LDRH_POST
11378 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11379 return LdSize == 4 ? ARM::LDR_POST_IMM
11380 : LdSize == 2 ? ARM::LDRH_POST
11381 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11382}
11383
11384 /// Return the store opcode for a given store size. If the store size is >= 8, a
11385 /// NEON opcode will be returned.
11386static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11387 if (StSize >= 8)
11388 return StSize == 16 ? ARM::VST1q32wb_fixed
11389 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11390 if (IsThumb1)
11391 return StSize == 4 ? ARM::tSTRi
11392 : StSize == 2 ? ARM::tSTRHi
11393 : StSize == 1 ? ARM::tSTRBi : 0;
11394 if (IsThumb2)
11395 return StSize == 4 ? ARM::t2STR_POST
11396 : StSize == 2 ? ARM::t2STRH_POST
11397 : StSize == 1 ? ARM::t2STRB_POST : 0;
11398 return StSize == 4 ? ARM::STR_POST_IMM
11399 : StSize == 2 ? ARM::STRH_POST
11400 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11401}
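// --- Illustrative only, not part of ARMISelLowering.cpp ---
// Two concrete cases of the opcode tables above, spelled out: Thumb1 selects
// the immediate-offset forms (the writeback add is emitted separately by the
// helpers below), while ARM mode selects post-indexed forms with built-in
// writeback.
static void opcodeSelectionExamples() {
  assert(getLdOpcode(4, /*IsThumb1=*/true, /*IsThumb2=*/false) == ARM::tLDRi);
  assert(getStOpcode(4, /*IsThumb1=*/false, /*IsThumb2=*/false) ==
         ARM::STR_POST_IMM);
}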
11402
11403/// Emit a post-increment load operation with given size. The instructions
11404/// will be added to BB at Pos.
11405 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11406 const TargetInstrInfo *TII, const DebugLoc &dl,
11407 unsigned LdSize, unsigned Data, unsigned AddrIn,
11408 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11409 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11410 assert(LdOpc != 0 && "Should have a load opcode");
11411 if (LdSize >= 8) {
11412 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11413 .addReg(AddrOut, RegState::Define)
11414 .addReg(AddrIn)
11415 .addImm(0)
11417 } else if (IsThumb1) {
11418 // load + update AddrIn
11419 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11420 .addReg(AddrIn)
11421 .addImm(0)
11423 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11424 .add(t1CondCodeOp())
11425 .addReg(AddrIn)
11426 .addImm(LdSize)
11428 } else if (IsThumb2) {
11429 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11430 .addReg(AddrOut, RegState::Define)
11431 .addReg(AddrIn)
11432 .addImm(LdSize)
11434 } else { // arm
11435 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11436 .addReg(AddrOut, RegState::Define)
11437 .addReg(AddrIn)
11438 .addReg(0)
11439 .addImm(LdSize)
11441 }
11442}
11443
11444/// Emit a post-increment store operation with given size. The instructions
11445/// will be added to BB at Pos.
11446 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11447 const TargetInstrInfo *TII, const DebugLoc &dl,
11448 unsigned StSize, unsigned Data, unsigned AddrIn,
11449 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11450 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11451 assert(StOpc != 0 && "Should have a store opcode");
11452 if (StSize >= 8) {
11453 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11454 .addReg(AddrIn)
11455 .addImm(0)
11456 .addReg(Data)
11458 } else if (IsThumb1) {
11459 // store + update AddrIn
11460 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11461 .addReg(Data)
11462 .addReg(AddrIn)
11463 .addImm(0)
11465 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11466 .add(t1CondCodeOp())
11467 .addReg(AddrIn)
11468 .addImm(StSize)
11470 } else if (IsThumb2) {
11471 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11472 .addReg(Data)
11473 .addReg(AddrIn)
11474 .addImm(StSize)
11476 } else { // arm
11477 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11478 .addReg(Data)
11479 .addReg(AddrIn)
11480 .addReg(0)
11481 .addImm(StSize)
11483 }
11484}
11485
11486 MachineBasicBlock *
11487 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11488 MachineBasicBlock *BB) const {
11489 // This pseudo instruction has 3 operands: dst, src, size
11490 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11491 // Otherwise, we will generate unrolled scalar copies.
11492 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11493 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11495
11496 Register dest = MI.getOperand(0).getReg();
11497 Register src = MI.getOperand(1).getReg();
11498 unsigned SizeVal = MI.getOperand(2).getImm();
11499 unsigned Alignment = MI.getOperand(3).getImm();
11500 DebugLoc dl = MI.getDebugLoc();
11501
11502 MachineFunction *MF = BB->getParent();
11503 MachineRegisterInfo &MRI = MF->getRegInfo();
11504 unsigned UnitSize = 0;
11505 const TargetRegisterClass *TRC = nullptr;
11506 const TargetRegisterClass *VecTRC = nullptr;
11507
11508 bool IsThumb1 = Subtarget->isThumb1Only();
11509 bool IsThumb2 = Subtarget->isThumb2();
11510 bool IsThumb = Subtarget->isThumb();
11511
11512 if (Alignment & 1) {
11513 UnitSize = 1;
11514 } else if (Alignment & 2) {
11515 UnitSize = 2;
11516 } else {
11517 // Check whether we can use NEON instructions.
11518 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11519 Subtarget->hasNEON()) {
11520 if ((Alignment % 16 == 0) && SizeVal >= 16)
11521 UnitSize = 16;
11522 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11523 UnitSize = 8;
11524 }
11525 // Can't use NEON instructions.
11526 if (UnitSize == 0)
11527 UnitSize = 4;
11528 }
11529
11530 // Select the correct opcode and register class for unit size load/store
11531 bool IsNeon = UnitSize >= 8;
11532 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11533 if (IsNeon)
11534 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11535 : UnitSize == 8 ? &ARM::DPRRegClass
11536 : nullptr;
11537
11538 unsigned BytesLeft = SizeVal % UnitSize;
11539 unsigned LoopSize = SizeVal - BytesLeft;
11540
11541 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11542 // Use LDR and STR to copy.
11543 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11544 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11545 unsigned srcIn = src;
11546 unsigned destIn = dest;
11547 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11548 Register srcOut = MRI.createVirtualRegister(TRC);
11549 Register destOut = MRI.createVirtualRegister(TRC);
11550 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11551 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11552 IsThumb1, IsThumb2);
11553 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11554 IsThumb1, IsThumb2);
11555 srcIn = srcOut;
11556 destIn = destOut;
11557 }
11558
11559 // Handle the leftover bytes with LDRB and STRB.
11560 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11561 // [destOut] = STRB_POST(scratch, destIn, 1)
11562 for (unsigned i = 0; i < BytesLeft; i++) {
11563 Register srcOut = MRI.createVirtualRegister(TRC);
11564 Register destOut = MRI.createVirtualRegister(TRC);
11565 Register scratch = MRI.createVirtualRegister(TRC);
11566 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11567 IsThumb1, IsThumb2);
11568 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11569 IsThumb1, IsThumb2);
11570 srcIn = srcOut;
11571 destIn = destOut;
11572 }
11573 MI.eraseFromParent(); // The instruction is gone now.
11574 return BB;
11575 }
11576
11577 // Expand the pseudo op to a loop.
11578 // thisMBB:
11579 // ...
11580 // movw varEnd, # --> with thumb2
11581 // movt varEnd, #
11582 // ldrcp varEnd, idx --> without thumb2
11583 // fallthrough --> loopMBB
11584 // loopMBB:
11585 // PHI varPhi, varEnd, varLoop
11586 // PHI srcPhi, src, srcLoop
11587 // PHI destPhi, dst, destLoop
11588 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11589 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11590 // subs varLoop, varPhi, #UnitSize
11591 // bne loopMBB
11592 // fallthrough --> exitMBB
11593 // exitMBB:
11594 // epilogue to handle left-over bytes
11595 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11596 // [destOut] = STRB_POST(scratch, destLoop, 1)
11597 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11598 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11599 MF->insert(It, loopMBB);
11600 MF->insert(It, exitMBB);
11601
11602 // Set the call frame size on entry to the new basic blocks.
11603 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11604 loopMBB->setCallFrameSize(CallFrameSize);
11605 exitMBB->setCallFrameSize(CallFrameSize);
11606
11607 // Transfer the remainder of BB and its successor edges to exitMBB.
11608 exitMBB->splice(exitMBB->begin(), BB,
11609 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11611
11612 // Load an immediate to varEnd.
11613 Register varEnd = MRI.createVirtualRegister(TRC);
11614 if (Subtarget->useMovt()) {
11615 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11616 varEnd)
11617 .addImm(LoopSize);
11618 } else if (Subtarget->genExecuteOnly()) {
11619 assert(IsThumb && "Non-thumb expected to have used movt");
11620 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11621 } else {
11622 MachineConstantPool *ConstantPool = MF->getConstantPool();
11624 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11625
11626 // MachineConstantPool wants an explicit alignment.
11627 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11628 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11629 MachineMemOperand *CPMMO =
11632
11633 if (IsThumb)
11634 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11635 .addReg(varEnd, RegState::Define)
11638 .addMemOperand(CPMMO);
11639 else
11640 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11641 .addReg(varEnd, RegState::Define)
11643 .addImm(0)
11645 .addMemOperand(CPMMO);
11646 }
11647 BB->addSuccessor(loopMBB);
11648
11649 // Generate the loop body:
11650 // varPhi = PHI(varLoop, varEnd)
11651 // srcPhi = PHI(srcLoop, src)
11652 // destPhi = PHI(destLoop, dst)
11653 MachineBasicBlock *entryBB = BB;
11654 BB = loopMBB;
11655 Register varLoop = MRI.createVirtualRegister(TRC);
11656 Register varPhi = MRI.createVirtualRegister(TRC);
11657 Register srcLoop = MRI.createVirtualRegister(TRC);
11658 Register srcPhi = MRI.createVirtualRegister(TRC);
11659 Register destLoop = MRI.createVirtualRegister(TRC);
11660 Register destPhi = MRI.createVirtualRegister(TRC);
11661
11662 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11663 .addReg(varLoop).addMBB(loopMBB)
11664 .addReg(varEnd).addMBB(entryBB);
11665 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11666 .addReg(srcLoop).addMBB(loopMBB)
11667 .addReg(src).addMBB(entryBB);
11668 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11669 .addReg(destLoop).addMBB(loopMBB)
11670 .addReg(dest).addMBB(entryBB);
11671
11672 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11673 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11674 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11675 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11676 IsThumb1, IsThumb2);
11677 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11678 IsThumb1, IsThumb2);
11679
11680 // Decrement loop variable by UnitSize.
11681 if (IsThumb1) {
11682 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11683 .add(t1CondCodeOp())
11684 .addReg(varPhi)
11685 .addImm(UnitSize)
11687 } else {
11688 MachineInstrBuilder MIB =
11689 BuildMI(*BB, BB->end(), dl,
11690 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11691 MIB.addReg(varPhi)
11692 .addImm(UnitSize)
11694 .add(condCodeOp());
11695 MIB->getOperand(5).setReg(ARM::CPSR);
11696 MIB->getOperand(5).setIsDef(true);
11697 }
11698 BuildMI(*BB, BB->end(), dl,
11699 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11700 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11701
11702 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11703 BB->addSuccessor(loopMBB);
11704 BB->addSuccessor(exitMBB);
11705
11706 // Add epilogue to handle BytesLeft.
11707 BB = exitMBB;
11708 auto StartOfExit = exitMBB->begin();
11709
11710 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11711 // [destOut] = STRB_POST(scratch, destLoop, 1)
11712 unsigned srcIn = srcLoop;
11713 unsigned destIn = destLoop;
11714 for (unsigned i = 0; i < BytesLeft; i++) {
11715 Register srcOut = MRI.createVirtualRegister(TRC);
11716 Register destOut = MRI.createVirtualRegister(TRC);
11717 Register scratch = MRI.createVirtualRegister(TRC);
11718 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11719 IsThumb1, IsThumb2);
11720 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11721 IsThumb1, IsThumb2);
11722 srcIn = srcOut;
11723 destIn = destOut;
11724 }
11725
11726 MI.eraseFromParent(); // The instruction is gone now.
11727 return BB;
11728}
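// --- Minimal sketch, not part of ARMISelLowering.cpp ---
// What the expansion above computes, written as plain C++: copy LoopSize
// bytes in UnitSize chunks (one post-increment load/store pair per chunk),
// then the remaining BytesLeft bytes one at a time. The per-iteration virtual
// register renaming is an SSA detail with no C++ analogue here.
static void structByvalCopySketch(char *Dst, const char *Src, unsigned SizeVal,
                                  unsigned UnitSize) {
  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;
  unsigned I = 0;
  for (; I < LoopSize; I += UnitSize)   // unrolled copies or the loopMBB body
    for (unsigned B = 0; B < UnitSize; ++B)
      Dst[I + B] = Src[I + B];
  for (; I < SizeVal; ++I)              // LDRB/STRB epilogue for BytesLeft
    Dst[I] = Src[I];
}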
11729
11730 MachineBasicBlock *
11731 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11732 MachineBasicBlock *MBB) const {
11733 const TargetMachine &TM = getTargetMachine();
11734 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11735 DebugLoc DL = MI.getDebugLoc();
11736
11737 assert(Subtarget->isTargetWindows() &&
11738 "__chkstk is only supported on Windows");
11739 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11740
11741 // __chkstk takes the number of words to allocate on the stack in R4, and
11742 // returns the stack adjustment in number of bytes in R4. This will not
11743 // clobber any other registers (other than the obvious lr).
11744 //
11745 // Although, technically, IP should be considered a register which may be
11746 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11747 // thumb-2 environment, so there is no interworking required. As a result, we
11748 // do not expect a veneer to be emitted by the linker, clobbering IP.
11749 //
11750 // Each module receives its own copy of __chkstk, so no import thunk is
11751 // required, again, ensuring that IP is not clobbered.
11752 //
11753 // Finally, although some linkers may theoretically provide a trampoline for
11754 // out of range calls (which is quite common due to a 32M range limitation of
11755 // branches for Thumb), we can generate the long-call version via
11756 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11757 // IP.
11758
11759 switch (TM.getCodeModel()) {
11760 case CodeModel::Tiny:
11761 llvm_unreachable("Tiny code model not available on ARM.");
11762 case CodeModel::Small:
11763 case CodeModel::Medium:
11764 case CodeModel::Kernel:
11765 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11767 .addExternalSymbol("__chkstk")
11770 .addReg(ARM::R12,
11772 .addReg(ARM::CPSR,
11774 break;
11775 case CodeModel::Large: {
11776 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11777 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11778
11779 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11780 .addExternalSymbol("__chkstk");
11786 .addReg(ARM::R12,
11788 .addReg(ARM::CPSR,
11790 break;
11791 }
11792 }
11793
11794 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11795 .addReg(ARM::SP, RegState::Kill)
11796 .addReg(ARM::R4, RegState::Kill)
11799 .add(condCodeOp());
11800
11801 MI.eraseFromParent();
11802 return MBB;
11803}
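// --- Minimal sketch, not part of ARMISelLowering.cpp ---
// The register contract the lowering above relies on, with a hypothetical
// callee pointer standing in for __chkstk: the word count goes in via R4, the
// byte adjustment comes back in R4, and the emitted t2SUBrr applies it to SP.
static unsigned long chkstkAdjustedSP(unsigned long SP, unsigned WordsInR4,
                                      unsigned (*ChkstkCallee)(unsigned)) {
  unsigned BytesInR4 = ChkstkCallee(WordsInR4); // bl __chkstk
  return SP - BytesInR4;                        // t2SUBrr SP, SP, R4
}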
11804
11805 MachineBasicBlock *
11806 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11807 MachineBasicBlock *MBB) const {
11808 DebugLoc DL = MI.getDebugLoc();
11809 MachineFunction *MF = MBB->getParent();
11810 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11811
11812 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11813 MF->insert(++MBB->getIterator(), ContBB);
11814 ContBB->splice(ContBB->begin(), MBB,
11815 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11817 MBB->addSuccessor(ContBB);
11818
11819 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11820 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11821 MF->push_back(TrapBB);
11822 MBB->addSuccessor(TrapBB);
11823
11824 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11825 .addReg(MI.getOperand(0).getReg())
11826 .addImm(0)
11828 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11829 .addMBB(TrapBB)
11831 .addReg(ARM::CPSR);
11832
11833 MI.eraseFromParent();
11834 return ContBB;
11835}
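// --- Minimal sketch, not part of ARMISelLowering.cpp ---
// The shape of the check emitted above: compare the divisor operand against
// zero and divert to the __brkdiv0 trap block, otherwise continue in ContBB.
// `BrkDiv0Trap` is a hypothetical stand-in for the trap.
static void dbzchkSketch(int Divisor, void (*BrkDiv0Trap)()) {
  if (Divisor == 0) // tCMPi8 ... #0; conditional branch to TrapBB
    BrkDiv0Trap();  // TrapBB: t__brkdiv0
  // fall through: execution resumes in ContBB
}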
11836
11837// The CPSR operand of SelectItr might be missing a kill marker
11838// because there were multiple uses of CPSR, and ISel didn't know
11839// which to mark. Figure out whether SelectItr should have had a
11840// kill marker, and set it if it should. Returns the correct kill
11841// marker value.
11842 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11843 MachineBasicBlock* BB,
11844 const TargetRegisterInfo* TRI) {
11845 // Scan forward through BB for a use/def of CPSR.
11846 MachineBasicBlock::iterator miI(std::next(SelectItr));
11847 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11848 const MachineInstr& mi = *miI;
11849 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11850 return false;
11851 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11852 break; // Should have kill-flag - update below.
11853 }
11854
11855 // If we hit the end of the block, check whether CPSR is live into a
11856 // successor.
11857 if (miI == BB->end()) {
11858 for (MachineBasicBlock *Succ : BB->successors())
11859 if (Succ->isLiveIn(ARM::CPSR))
11860 return false;
11861 }
11862
11863 // We found a def, or hit the end of the basic block and CPSR wasn't live
11864 // out. SelectMI should have a kill flag on CPSR.
11865 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11866 return true;
11867}
11868
11869 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11870 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11871 static Register genTPEntry(MachineBasicBlock *TpEntry,
11872 MachineBasicBlock *TpLoopBody,
11873 MachineBasicBlock *TpExit, Register OpSizeReg,
11874 const TargetInstrInfo *TII, DebugLoc Dl,
11875 MachineRegisterInfo &MRI) {
11876 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11877 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11878 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11879 .addUse(OpSizeReg)
11880 .addImm(15)
11882 .addReg(0);
11883
11884 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11885 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11886 .addUse(AddDestReg, RegState::Kill)
11887 .addImm(4)
11889 .addReg(0);
11890
11891 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11892 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11893 .addUse(LsrDestReg, RegState::Kill);
11894
11895 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11896 .addUse(TotalIterationsReg)
11897 .addMBB(TpExit);
11898
11899 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11900 .addMBB(TpLoopBody)
11902
11903 return TotalIterationsReg;
11904}
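// --- Worked example, not part of ARMISelLowering.cpp ---
// The iteration count computed above (t2ADDri #15 followed by t2LSRri #4) is
// ceil(n / 16) for the 16-byte MVE vector width; e.g. n = 35 gives
// (35 + 15) >> 4 == 3 iterations.
static unsigned tailPredicatedIterationCount(unsigned ByteCount) {
  return (ByteCount + 15) >> 4; // ceil(ByteCount / 16)
}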
11905
11906/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11907/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11908/// loops.
11909static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11910 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11911 const TargetInstrInfo *TII, DebugLoc Dl,
11912 MachineRegisterInfo &MRI, Register OpSrcReg,
11913 Register OpDestReg, Register ElementCountReg,
11914 Register TotalIterationsReg, bool IsMemcpy) {
11915 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11916 // array, loop iteration counter, predication counter.
11917
11918 Register SrcPhiReg, CurrSrcReg;
11919 if (IsMemcpy) {
11920 // Current position in the src array
11921 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11922 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11923 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11924 .addUse(OpSrcReg)
11925 .addMBB(TpEntry)
11926 .addUse(CurrSrcReg)
11927 .addMBB(TpLoopBody);
11928 }
11929
11930 // Current position in the dest array
11931 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11932 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11933 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11934 .addUse(OpDestReg)
11935 .addMBB(TpEntry)
11936 .addUse(CurrDestReg)
11937 .addMBB(TpLoopBody);
11938
11939 // Current loop counter
11940 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11941 Register RemainingLoopIterationsReg =
11942 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11943 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11944 .addUse(TotalIterationsReg)
11945 .addMBB(TpEntry)
11946 .addUse(RemainingLoopIterationsReg)
11947 .addMBB(TpLoopBody);
11948
11949 // Predication counter
11950 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11951 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11952 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11953 .addUse(ElementCountReg)
11954 .addMBB(TpEntry)
11955 .addUse(RemainingElementsReg)
11956 .addMBB(TpLoopBody);
11957
11958 // Pass predication counter to VCTP
11959 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11960 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11961 .addUse(PredCounterPhiReg)
11963 .addReg(0)
11964 .addReg(0);
11965
11966 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11967 .addUse(PredCounterPhiReg)
11968 .addImm(16)
11970 .addReg(0);
11971
11972 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11973 Register SrcValueReg;
11974 if (IsMemcpy) {
11975 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11976 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11977 .addDef(CurrSrcReg)
11978 .addDef(SrcValueReg)
11979 .addReg(SrcPhiReg)
11980 .addImm(16)
11982 .addUse(VccrReg)
11983 .addReg(0);
11984 } else
11985 SrcValueReg = OpSrcReg;
11986
11987 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11988 .addDef(CurrDestReg)
11989 .addUse(SrcValueReg)
11990 .addReg(DestPhiReg)
11991 .addImm(16)
11993 .addUse(VccrReg)
11994 .addReg(0);
11995
11996 // Add the pseudoInstrs for decrementing the loop counter and marking the
11997 // end: t2DoLoopDec and t2DoLoopEnd.
11998 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11999 .addUse(LoopCounterPhiReg)
12000 .addImm(1);
12001
12002 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
12003 .addUse(RemainingLoopIterationsReg)
12004 .addMBB(TpLoopBody);
12005
12006 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
12007 .addMBB(TpExit)
12009}
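// --- Minimal sketch, not part of ARMISelLowering.cpp ---
// Scalar picture of the tail-predicated copy the MVE loop above performs:
// every iteration handles up to 16 bytes, and the final iteration is masked
// by the VCTP predicate down to the remaining element count.
static void tailPredicatedCopySketch(uint8_t *Dst, const uint8_t *Src,
                                     unsigned N) {
  for (unsigned Done = 0; Done < N; Done += 16) {
    unsigned Lanes = std::min(16u, N - Done); // what MVE_VCTP8 predicates
    for (unsigned L = 0; L < Lanes; ++L)      // predicated VLDRBU8/VSTRBU8 lanes
      Dst[Done + L] = Src[Done + L];
  }
}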
12010
12011 MachineBasicBlock *
12012 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
12013 MachineBasicBlock *BB) const {
12014 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12015 DebugLoc dl = MI.getDebugLoc();
12016 bool isThumb2 = Subtarget->isThumb2();
12017 switch (MI.getOpcode()) {
12018 default: {
12019 MI.print(errs());
12020 llvm_unreachable("Unexpected instr type to insert");
12021 }
12022
12023 // Thumb1 post-indexed loads are really just single-register LDMs.
12024 case ARM::tLDR_postidx: {
12025 MachineOperand Def(MI.getOperand(1));
12026 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12027 .add(Def) // Rn_wb
12028 .add(MI.getOperand(2)) // Rn
12029 .add(MI.getOperand(3)) // PredImm
12030 .add(MI.getOperand(4)) // PredReg
12031 .add(MI.getOperand(0)) // Rt
12032 .cloneMemRefs(MI);
12033 MI.eraseFromParent();
12034 return BB;
12035 }
12036
12037 case ARM::MVE_MEMCPYLOOPINST:
12038 case ARM::MVE_MEMSETLOOPINST: {
12039
12040 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12041 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12042 // the iteration count = ceil(size_in_bytes / 16) in the TP entry block and
12043 // adds the relevant instructions in the TP loop Body for generation of a
12044 // WLSTP loop.
12045
12046 // Below is the relevant portion of the CFG after the transformation.
12047 // The Machine Basic Blocks are shown along with branch conditions (in
12048 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12049 // portion of the CFG and may not necessarily be the entry/exit of the
12050 // function.
12051
12052 // (Relevant) CFG after transformation:
12053 // TP entry MBB
12054 // |
12055 // |-----------------|
12056 // (n <= 0) (n > 0)
12057 // | |
12058 // | TP loop Body MBB<--|
12059 // | | |
12060 // \ |___________|
12061 // \ /
12062 // TP exit MBB
12063
12064 MachineFunction *MF = BB->getParent();
12065 MachineFunctionProperties &Properties = MF->getProperties();
12067
12068 Register OpDestReg = MI.getOperand(0).getReg();
12069 Register OpSrcReg = MI.getOperand(1).getReg();
12070 Register OpSizeReg = MI.getOperand(2).getReg();
12071
12072 // Allocate the required MBBs and add to parent function.
12073 MachineBasicBlock *TpEntry = BB;
12074 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12075 MachineBasicBlock *TpExit;
12076
12077 MF->push_back(TpLoopBody);
12078
12079 // If any instructions are present in the current block after
12080 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12081 // move the instructions into the newly created exit block. If there are no
12082 // instructions, add an explicit branch to the FallThrough block and then
12083 // split.
12084 //
12085 // The split is required for two reasons:
12086 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12087 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12088 // need to be updated. splitAt() already handles this.
12089 TpExit = BB->splitAt(MI, false);
12090 if (TpExit == BB) {
12091 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12092 "block containing memcpy/memset Pseudo");
12093 TpExit = BB->getFallThrough();
12094 BuildMI(BB, dl, TII->get(ARM::t2B))
12095 .addMBB(TpExit)
12097 TpExit = BB->splitAt(MI, false);
12098 }
12099
12100 // Add logic for iteration count
12101 Register TotalIterationsReg =
12102 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12103
12104 // Add the vectorized (and predicated) loads/store instructions
12105 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12106 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12107 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12108
12109 // Required to avoid conflict with the MachineVerifier during testing.
12110 Properties.resetNoPHIs();
12111
12112 // Connect the blocks
12113 TpEntry->addSuccessor(TpLoopBody);
12114 TpLoopBody->addSuccessor(TpLoopBody);
12115 TpLoopBody->addSuccessor(TpExit);
12116
12117 // Reorder for a more natural layout
12118 TpLoopBody->moveAfter(TpEntry);
12119 TpExit->moveAfter(TpLoopBody);
12120
12121 // Finally, remove the memcpy Pseudo Instruction
12122 MI.eraseFromParent();
12123
12124 // Return the exit block as it may contain other instructions requiring a
12125 // custom inserter
12126 return TpExit;
12127 }
12128
12129 // The Thumb2 pre-indexed stores have the same MI operands; they just
12130 // define them differently in the .td files from the isel patterns, so
12131 // they need pseudos.
12132 case ARM::t2STR_preidx:
12133 MI.setDesc(TII->get(ARM::t2STR_PRE));
12134 return BB;
12135 case ARM::t2STRB_preidx:
12136 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12137 return BB;
12138 case ARM::t2STRH_preidx:
12139 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12140 return BB;
12141
12142 case ARM::STRi_preidx:
12143 case ARM::STRBi_preidx: {
12144 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12145 : ARM::STRB_PRE_IMM;
12146 // Decode the offset.
12147 unsigned Offset = MI.getOperand(4).getImm();
12148 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12150 if (isSub)
12151 Offset = -Offset;
12152
12153 MachineMemOperand *MMO = *MI.memoperands_begin();
12154 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12155 .add(MI.getOperand(0)) // Rn_wb
12156 .add(MI.getOperand(1)) // Rt
12157 .add(MI.getOperand(2)) // Rn
12158 .addImm(Offset) // offset (skip GPR==zero_reg)
12159 .add(MI.getOperand(5)) // pred
12160 .add(MI.getOperand(6))
12161 .addMemOperand(MMO);
12162 MI.eraseFromParent();
12163 return BB;
12164 }
12165 case ARM::STRr_preidx:
12166 case ARM::STRBr_preidx:
12167 case ARM::STRH_preidx: {
12168 unsigned NewOpc;
12169 switch (MI.getOpcode()) {
12170 default: llvm_unreachable("unexpected opcode!");
12171 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12172 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12173 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12174 }
12175 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12176 for (const MachineOperand &MO : MI.operands())
12177 MIB.add(MO);
12178 MI.eraseFromParent();
12179 return BB;
12180 }
12181
12182 case ARM::tMOVCCr_pseudo: {
12183 // To "insert" a SELECT_CC instruction, we actually have to insert the
12184 // diamond control-flow pattern. The incoming instruction knows the
12185 // destination vreg to set, the condition code register to branch on, the
12186 // true/false values to select between, and a branch opcode to use.
12187 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12189
12190 // thisMBB:
12191 // ...
12192 // TrueVal = ...
12193 // cmpTY ccX, r1, r2
12194 // bCC copy1MBB
12195 // fallthrough --> copy0MBB
12196 MachineBasicBlock *thisMBB = BB;
12197 MachineFunction *F = BB->getParent();
12198 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12199 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12200 F->insert(It, copy0MBB);
12201 F->insert(It, sinkMBB);
12202
12203 // Set the call frame size on entry to the new basic blocks.
12204 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12205 copy0MBB->setCallFrameSize(CallFrameSize);
12206 sinkMBB->setCallFrameSize(CallFrameSize);
12207
12208 // Check whether CPSR is live past the tMOVCCr_pseudo.
12209 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12210 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12211 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12212 copy0MBB->addLiveIn(ARM::CPSR);
12213 sinkMBB->addLiveIn(ARM::CPSR);
12214 }
12215
12216 // Transfer the remainder of BB and its successor edges to sinkMBB.
12217 sinkMBB->splice(sinkMBB->begin(), BB,
12218 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12220
12221 BB->addSuccessor(copy0MBB);
12222 BB->addSuccessor(sinkMBB);
12223
12224 BuildMI(BB, dl, TII->get(ARM::tBcc))
12225 .addMBB(sinkMBB)
12226 .addImm(MI.getOperand(3).getImm())
12227 .addReg(MI.getOperand(4).getReg());
12228
12229 // copy0MBB:
12230 // %FalseValue = ...
12231 // # fallthrough to sinkMBB
12232 BB = copy0MBB;
12233
12234 // Update machine-CFG edges
12235 BB->addSuccessor(sinkMBB);
12236
12237 // sinkMBB:
12238 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12239 // ...
12240 BB = sinkMBB;
12241 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12242 .addReg(MI.getOperand(1).getReg())
12243 .addMBB(copy0MBB)
12244 .addReg(MI.getOperand(2).getReg())
12245 .addMBB(thisMBB);
12246
12247 MI.eraseFromParent(); // The pseudo instruction is gone now.
12248 return BB;
12249 }
12250
12251 case ARM::BCCi64:
12252 case ARM::BCCZi64: {
12253 // If there is an unconditional branch to the other successor, remove it.
12254 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12255
12256 // Compare both parts that make up the double comparison separately for
12257 // equality.
12258 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12259
12260 Register LHS1 = MI.getOperand(1).getReg();
12261 Register LHS2 = MI.getOperand(2).getReg();
12262 if (RHSisZero) {
12263 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12264 .addReg(LHS1)
12265 .addImm(0)
12267 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12268 .addReg(LHS2).addImm(0)
12269 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12270 } else {
12271 Register RHS1 = MI.getOperand(3).getReg();
12272 Register RHS2 = MI.getOperand(4).getReg();
12273 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12274 .addReg(LHS1)
12275 .addReg(RHS1)
12277 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12278 .addReg(LHS2).addReg(RHS2)
12279 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12280 }
12281
12282 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12283 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12284 if (MI.getOperand(0).getImm() == ARMCC::NE)
12285 std::swap(destMBB, exitMBB);
12286
12287 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12288 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12289 if (isThumb2)
12290 BuildMI(BB, dl, TII->get(ARM::t2B))
12291 .addMBB(exitMBB)
12293 else
12294 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12295
12296 MI.eraseFromParent(); // The pseudo instruction is gone now.
12297 return BB;
12298 }
12299
12300 case ARM::Int_eh_sjlj_setjmp:
12301 case ARM::Int_eh_sjlj_setjmp_nofp:
12302 case ARM::tInt_eh_sjlj_setjmp:
12303 case ARM::t2Int_eh_sjlj_setjmp:
12304 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12305 return BB;
12306
12307 case ARM::Int_eh_sjlj_setup_dispatch:
12308 EmitSjLjDispatchBlock(MI, BB);
12309 return BB;
12310 case ARM::COPY_STRUCT_BYVAL_I32:
12311 ++NumLoopByVals;
12312 return EmitStructByval(MI, BB);
12313 case ARM::WIN__CHKSTK:
12314 return EmitLowered__chkstk(MI, BB);
12315 case ARM::WIN__DBZCHK:
12316 return EmitLowered__dbzchk(MI, BB);
12317 }
12318}
12319
12320/// Attaches vregs to MEMCPY that it will use as scratch registers
12321/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12322/// instead of as a custom inserter because we need the use list from the SDNode.
12323static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12324 MachineInstr &MI, const SDNode *Node) {
12325 bool isThumb1 = Subtarget->isThumb1Only();
12326
12327 MachineFunction *MF = MI.getParent()->getParent();
12329 MachineInstrBuilder MIB(*MF, MI);
12330
12331 // If the new dst/src is unused mark it as dead.
12332 if (!Node->hasAnyUseOfValue(0)) {
12333 MI.getOperand(0).setIsDead(true);
12334 }
12335 if (!Node->hasAnyUseOfValue(1)) {
12336 MI.getOperand(1).setIsDead(true);
12337 }
12338
12339 // The MEMCPY both defines and kills the scratch registers.
12340 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12341 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12342 : &ARM::GPRRegClass);
12344 }
12345}
12346
12347 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12348 SDNode *Node) const {
12349 if (MI.getOpcode() == ARM::MEMCPY) {
12350 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12351 return;
12352 }
12353
12354 const MCInstrDesc *MCID = &MI.getDesc();
12355 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12356 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12357 // operand is still set to noreg. If needed, set the optional operand's
12358 // register to CPSR, and remove the redundant implicit def.
12359 //
12360 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12361
12362 // Rename pseudo opcodes.
12363 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12364 unsigned ccOutIdx;
12365 if (NewOpc) {
12366 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12367 MCID = &TII->get(NewOpc);
12368
12369 assert(MCID->getNumOperands() ==
12370 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12371 && "converted opcode should be the same except for cc_out"
12372 " (and, on Thumb1, pred)");
12373
12374 MI.setDesc(*MCID);
12375
12376 // Add the optional cc_out operand
12377 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12378
12379 // On Thumb1, move all input operands to the end, then add the predicate
12380 if (Subtarget->isThumb1Only()) {
12381 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12382 MI.addOperand(MI.getOperand(1));
12383 MI.removeOperand(1);
12384 }
12385
12386 // Restore the ties
12387 for (unsigned i = MI.getNumOperands(); i--;) {
12388 const MachineOperand& op = MI.getOperand(i);
12389 if (op.isReg() && op.isUse()) {
12390 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12391 if (DefIdx != -1)
12392 MI.tieOperands(DefIdx, i);
12393 }
12394 }
12395
12397 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12398 ccOutIdx = 1;
12399 } else
12400 ccOutIdx = MCID->getNumOperands() - 1;
12401 } else
12402 ccOutIdx = MCID->getNumOperands() - 1;
12403
12404 // Any ARM instruction that sets the 's' bit should specify an optional
12405 // "cc_out" operand in the last operand position.
12406 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12407 assert(!NewOpc && "Optional cc_out operand required");
12408 return;
12409 }
12410 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12411 // since we already have an optional CPSR def.
12412 bool definesCPSR = false;
12413 bool deadCPSR = false;
12414 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12415 ++i) {
12416 const MachineOperand &MO = MI.getOperand(i);
12417 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12418 definesCPSR = true;
12419 if (MO.isDead())
12420 deadCPSR = true;
12421 MI.removeOperand(i);
12422 break;
12423 }
12424 }
12425 if (!definesCPSR) {
12426 assert(!NewOpc && "Optional cc_out operand required");
12427 return;
12428 }
12429 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12430 if (deadCPSR) {
12431 assert(!MI.getOperand(ccOutIdx).getReg() &&
12432 "expect uninitialized optional cc_out operand");
12433 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12434 if (!Subtarget->isThumb1Only())
12435 return;
12436 }
12437
12438 // If this instruction was defined with an optional CPSR def and its dag node
12439 // had a live implicit CPSR def, then activate the optional CPSR def.
12440 MachineOperand &MO = MI.getOperand(ccOutIdx);
12441 MO.setReg(ARM::CPSR);
12442 MO.setIsDef(true);
12443}
12444
12445//===----------------------------------------------------------------------===//
12446// ARM Optimization Hooks
12447//===----------------------------------------------------------------------===//
12448
12449// Helper function that checks if N is a null or all ones constant.
12450static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12452}
12453
12454// Return true if N is conditionally 0 or all ones.
12455// Detects these expressions where cc is an i1 value:
12456//
12457// (select cc 0, y) [AllOnes=0]
12458// (select cc y, 0) [AllOnes=0]
12459// (zext cc) [AllOnes=0]
12460// (sext cc) [AllOnes=0/1]
12461// (select cc -1, y) [AllOnes=1]
12462// (select cc y, -1) [AllOnes=1]
12463//
12464// Invert is set when N is the null/all ones constant when CC is false.
12465// OtherOp is set to the alternative value of N.
12466 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12467 SDValue &CC, bool &Invert,
12468 SDValue &OtherOp,
12469 SelectionDAG &DAG) {
12470 switch (N->getOpcode()) {
12471 default: return false;
12472 case ISD::SELECT: {
12473 CC = N->getOperand(0);
12474 SDValue N1 = N->getOperand(1);
12475 SDValue N2 = N->getOperand(2);
12476 if (isZeroOrAllOnes(N1, AllOnes)) {
12477 Invert = false;
12478 OtherOp = N2;
12479 return true;
12480 }
12481 if (isZeroOrAllOnes(N2, AllOnes)) {
12482 Invert = true;
12483 OtherOp = N1;
12484 return true;
12485 }
12486 return false;
12487 }
12488 case ISD::ZERO_EXTEND:
12489 // (zext cc) can never be the all ones value.
12490 if (AllOnes)
12491 return false;
12492 [[fallthrough]];
12493 case ISD::SIGN_EXTEND: {
12494 SDLoc dl(N);
12495 EVT VT = N->getValueType(0);
12496 CC = N->getOperand(0);
12497 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12498 return false;
12499 Invert = !AllOnes;
12500 if (AllOnes)
12501 // When looking for an AllOnes constant, N is an sext, and the 'other'
12502 // value is 0.
12503 OtherOp = DAG.getConstant(0, dl, VT);
12504 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12505 // When looking for a 0 constant, N can be zext or sext.
12506 OtherOp = DAG.getConstant(1, dl, VT);
12507 else
12508 OtherOp = DAG.getAllOnesConstant(dl, VT);
12509 return true;
12510 }
12511 }
12512}
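// Worked example (illustrative): for N = (select cc, 0, 7) with AllOnes unset,
// the SELECT case above matches the zero in operand 1, so Invert is false and
// OtherOp is 7. For N = (zext cc), where cc is an i1 setcc, OtherOp becomes
// the constant 1, the only other value a zero-extended i1 can take.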
12513
12514// Combine a constant select operand into its use:
12515//
12516 // (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
12517 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
12518 // (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
12519 // (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
12520 // (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
12521//
12522// The transform is rejected if the select doesn't have a constant operand that
12523// is null, or all ones when AllOnes is set.
12524//
12525// Also recognize sext/zext from i1:
12526//
12527// (add (zext cc), x) -> (select cc (add x, 1), x)
12528// (add (sext cc), x) -> (select cc (add x, -1), x)
12529//
12530// These transformations eventually create predicated instructions.
12531//
12532// @param N The node to transform.
12533// @param Slct The N operand that is a select.
12534// @param OtherOp The other N operand (x above).
12535// @param DCI Context.
12536// @param AllOnes Require the select constant to be all ones instead of null.
12537// @returns The new node, or SDValue() on failure.
12538static
12541 bool AllOnes = false) {
12542 SelectionDAG &DAG = DCI.DAG;
12543 EVT VT = N->getValueType(0);
12544 SDValue NonConstantVal;
12545 SDValue CCOp;
12546 bool SwapSelectOps;
12547 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12548 NonConstantVal, DAG))
12549 return SDValue();
12550
12551 // Slct is now known to be the desired identity constant when CC is true.
12552 SDValue TrueVal = OtherOp;
12553 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12554 OtherOp, NonConstantVal);
12555 // Unless SwapSelectOps says CC should be false.
12556 if (SwapSelectOps)
12557 std::swap(TrueVal, FalseVal);
12558
12559 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12560 CCOp, TrueVal, FalseVal);
12561}
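// Worked example (illustrative): with N = (add (select cc, 0, 7), x), Slct is
// the select and OtherOp is x. isConditionalZeroOrAllOnes reports the zero
// case with SwapSelectOps false and NonConstantVal 7, so the node built above
// is (select cc, x, (add x, 7)), the first pattern in the comment block
// preceding this function.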
12562
12563// Attempt combineSelectAndUse on each operand of a commutative operator N.
12564static
12565 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12566 TargetLowering::DAGCombinerInfo &DCI) {
12567 SDValue N0 = N->getOperand(0);
12568 SDValue N1 = N->getOperand(1);
12569 if (N0.getNode()->hasOneUse())
12570 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12571 return Result;
12572 if (N1.getNode()->hasOneUse())
12573 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12574 return Result;
12575 return SDValue();
12576}
12577
12578 static bool IsVUZPShuffleNode(SDNode *N) {
12579 // VUZP shuffle node.
12580 if (N->getOpcode() == ARMISD::VUZP)
12581 return true;
12582
12583 // "VUZP" on i32 is an alias for VTRN.
12584 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12585 return true;
12586
12587 return false;
12588}
12589
12592 const ARMSubtarget *Subtarget) {
12593 // Look for ADD(VUZP.0, VUZP.1).
12594 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12595 N0 == N1)
12596 return SDValue();
12597
12598 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12599 if (!N->getValueType(0).is64BitVector())
12600 return SDValue();
12601
12602 // Generate vpadd.
12603 SelectionDAG &DAG = DCI.DAG;
12604 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12605 SDLoc dl(N);
12606 SDNode *Unzip = N0.getNode();
12607 EVT VT = N->getValueType(0);
12608
12610 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12611 TLI.getPointerTy(DAG.getDataLayout())));
12612 Ops.push_back(Unzip->getOperand(0));
12613 Ops.push_back(Unzip->getOperand(1));
12614
12615 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12616}
12617
12620 const ARMSubtarget *Subtarget) {
12621 // Check for two extended operands.
12622 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12623 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12624 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12625 N1.getOpcode() == ISD::ZERO_EXTEND))
12626 return SDValue();
12627
12628 SDValue N00 = N0.getOperand(0);
12629 SDValue N10 = N1.getOperand(0);
12630
12631 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12632 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12633 N00 == N10)
12634 return SDValue();
12635
12636 // We only recognize Q register paddl here; this can't be reached until
12637 // after type legalization.
12638 if (!N00.getValueType().is64BitVector() ||
12640 return SDValue();
12641
12642 // Generate vpaddl.
12643 SelectionDAG &DAG = DCI.DAG;
12644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12645 SDLoc dl(N);
12646 EVT VT = N->getValueType(0);
12647
12649 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12650 unsigned Opcode;
12651 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12652 Opcode = Intrinsic::arm_neon_vpaddls;
12653 else
12654 Opcode = Intrinsic::arm_neon_vpaddlu;
12655 Ops.push_back(DAG.getConstant(Opcode, dl,
12656 TLI.getPointerTy(DAG.getDataLayout())));
12657 EVT ElemTy = N00.getValueType().getVectorElementType();
12658 unsigned NumElts = VT.getVectorNumElements();
12659 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12660 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12661 N00.getOperand(0), N00.getOperand(1));
12662 Ops.push_back(Concat);
12663
12664 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12665}
12666
12667// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12668// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12669// much easier to match.
12670static SDValue
12673 const ARMSubtarget *Subtarget) {
12674 // Only perform this optimization after legalization, and only if NEON is
12675 // available. We also expect both operands to be BUILD_VECTORs.
12676 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12677 || N0.getOpcode() != ISD::BUILD_VECTOR
12678 || N1.getOpcode() != ISD::BUILD_VECTOR)
12679 return SDValue();
12680
12681 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12682 EVT VT = N->getValueType(0);
12683 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12684 return SDValue();
12685
12686 // Check that the vector operands are of the right form.
12687 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12688 // operands, where N is the size of the formed vector.
12689 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12690 // index such that we have a pairwise add pattern.
12691
12692 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12693 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12694 return SDValue();
12695 SDValue Vec = N0->getOperand(0)->getOperand(0);
12696 SDNode *V = Vec.getNode();
12697 unsigned nextIndex = 0;
12698
12699 // For each of the ADD's operands that is a BUILD_VECTOR, check whether
12700 // each of its operands is an EXTRACT_VECTOR_ELT from the same vector with
12701 // the appropriate index.
12702 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12703 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12704 N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12705
12706 SDValue ExtVec0 = N0->getOperand(i);
12707 SDValue ExtVec1 = N1->getOperand(i);
12708
12709 // First operand is the vector; verify it's the same.
12710 if (V != ExtVec0->getOperand(0).getNode() ||
12711 V != ExtVec1->getOperand(0).getNode())
12712 return SDValue();
12713
12714 // Second is the constant index; verify it's correct.
12715 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12716 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12717
12718 // For the constant, we want to see all the even or all the odd.
12719 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12720 || C1->getZExtValue() != nextIndex+1)
12721 return SDValue();
12722
12723 // Increment index.
12724 nextIndex+=2;
12725 } else
12726 return SDValue();
12727 }
12728
12729 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12730 // we're using the entire input vector, otherwise there's a size/legality
12731 // mismatch somewhere.
12732 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12733 Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12734 return SDValue();
12735
12736 // Create VPADDL node.
12737 SelectionDAG &DAG = DCI.DAG;
12738 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12739
12740 SDLoc dl(N);
12741
12742 // Build operand list.
12743 SmallVector<SDValue, 8> Ops;
12744 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12745 TLI.getPointerTy(DAG.getDataLayout())));
12746
12747 // Input is the vector.
12748 Ops.push_back(Vec);
12749
12750 // Get widened type and narrowed type.
12751 MVT widenType;
12752 unsigned numElem = VT.getVectorNumElements();
12753
12754 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12755 switch (inputLaneType.getSimpleVT().SimpleTy) {
12756 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12757 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12758 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12759 default:
12760 llvm_unreachable("Invalid vector element type for padd optimization.");
12761 }
12762
12763 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12764 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12765 return DAG.getNode(ExtOp, dl, VT, tmp);
12766}
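// For example (roughly), with Vec a v8i8 value:
//   N0 = build_vector (extractelt Vec, 0), (extractelt Vec, 2), ...
//   N1 = build_vector (extractelt Vec, 1), (extractelt Vec, 3), ...
//   add N0, N1
// becomes llvm.arm.neon.vpaddls(Vec) at the widened type (v4i16 here),
// followed by a truncate or any_extend back to the original result type.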
12767
12768 static SDValue findMUL_LOHI(SDValue V) {
12769 if (V->getOpcode() == ISD::UMUL_LOHI ||
12770 V->getOpcode() == ISD::SMUL_LOHI)
12771 return V;
12772 return SDValue();
12773}
12774
12775static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12776 TargetLowering::DAGCombinerInfo &DCI,
12777 const ARMSubtarget *Subtarget) {
12778 if (!Subtarget->hasBaseDSP())
12779 return SDValue();
12780
12781 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12782 // accumulates the product into a 64-bit value. The 16-bit values will
12783 // be sign extended somehow or SRA'd into 32-bit values
12784 // (addc (adde (mul 16bit, 16bit), lo), hi)
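// For example (roughly): when both multiplicands are known sign-extended
// 16-bit values, the addc/adde + mul cluster described above becomes a single
// ARMISD::SMLALBB ("smlalbb RdLo, RdHi, Rn, Rm"); the BT/TB/TT variants are
// chosen when an operand is a top half, i.e. an SRA by 16.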
12785 SDValue Mul = AddcNode->getOperand(0);
12786 SDValue Lo = AddcNode->getOperand(1);
12787 if (Mul.getOpcode() != ISD::MUL) {
12788 Lo = AddcNode->getOperand(0);
12789 Mul = AddcNode->getOperand(1);
12790 if (Mul.getOpcode() != ISD::MUL)
12791 return SDValue();
12792 }
12793
12794 SDValue SRA = AddeNode->getOperand(0);
12795 SDValue Hi = AddeNode->getOperand(1);
12796 if (SRA.getOpcode() != ISD::SRA) {
12797 SRA = AddeNode->getOperand(1);
12798 Hi = AddeNode->getOperand(0);
12799 if (SRA.getOpcode() != ISD::SRA)
12800 return SDValue();
12801 }
12802 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12803 if (Const->getZExtValue() != 31)
12804 return SDValue();
12805 } else
12806 return SDValue();
12807
12808 if (SRA.getOperand(0) != Mul)
12809 return SDValue();
12810
12811 SelectionDAG &DAG = DCI.DAG;
12812 SDLoc dl(AddcNode);
12813 unsigned Opcode = 0;
12814 SDValue Op0;
12815 SDValue Op1;
12816
12817 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12818 Opcode = ARMISD::SMLALBB;
12819 Op0 = Mul.getOperand(0);
12820 Op1 = Mul.getOperand(1);
12821 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12822 Opcode = ARMISD::SMLALBT;
12823 Op0 = Mul.getOperand(0);
12824 Op1 = Mul.getOperand(1).getOperand(0);
12825 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12826 Opcode = ARMISD::SMLALTB;
12827 Op0 = Mul.getOperand(0).getOperand(0);
12828 Op1 = Mul.getOperand(1);
12829 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12830 Opcode = ARMISD::SMLALTT;
12831 Op0 = Mul->getOperand(0).getOperand(0);
12832 Op1 = Mul->getOperand(1).getOperand(0);
12833 }
12834
12835 if (!Op0 || !Op1)
12836 return SDValue();
12837
12838 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12839 Op0, Op1, Lo, Hi);
12841 // Replace the ADD nodes' uses with the MLA node's values.
12841 SDValue HiMLALResult(SMLAL.getNode(), 1);
12842 SDValue LoMLALResult(SMLAL.getNode(), 0);
12843
12844 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12845 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12846
12847 // Return original node to notify the driver to stop replacing.
12848 SDValue resNode(AddcNode, 0);
12849 return resNode;
12850}
12851
12852 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12853 TargetLowering::DAGCombinerInfo &DCI,
12854 const ARMSubtarget *Subtarget) {
12855 // Look for multiply add opportunities.
12856 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12857 // each add node consumes a value from ISD::UMUL_LOHI and there is
12858 // a glue link from the first add to the second add.
12859 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12860 // a S/UMLAL instruction.
12861 // UMUL_LOHI
12862 // / :lo \ :hi
12863 // V \ [no multiline comment]
12864 // loAdd -> ADDC |
12865 // \ :carry /
12866 // V V
12867 // ADDE <- hiAdd
12868 //
12869 // In the special case where only the higher part of a signed result is used
12870 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12871 // a constant with the exact value of 0x80000000, we recognize we are dealing
12872 // with a "rounded multiply and add" (or subtract) and transform it into
12873 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
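// For example (roughly): with t = ISD::UMUL_LOHI(a, b),
//   ADDC(t.lo, x)            ; produces the low word plus a carry
//   ADDE(t.hi, y, carry)     ; consumes that carry
// collapses into one ARMISD::UMLAL computing a*b + (y:x); the signed form
// uses ARMISD::SMLAL, and SMMLAR/SMMLSR handle the rounded cases described
// above (low addend equal to 0x80000000, only the high half used).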
12874
12875 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12876 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12877 "Expect an ADDE or SUBE");
12878
12879 assert(AddeSubeNode->getNumOperands() == 3 &&
12880 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12881 "ADDE node has the wrong inputs");
12882
12883 // Check that we are chained to the right ADDC or SUBC node.
12884 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12885 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12886 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12887 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12888 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12889 return SDValue();
12890
12891 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12892 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12893
12894 // Check if the two operands are from the same mul_lohi node.
12895 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12896 return SDValue();
12897
12898 assert(AddcSubcNode->getNumValues() == 2 &&
12899 AddcSubcNode->getValueType(0) == MVT::i32 &&
12900 "Expect ADDC with two result values. First: i32");
12901
12902 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12903 // may be an SMLAL which multiplies two 16-bit values.
12904 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12905 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12906 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12907 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12908 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12909 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12910
12911 // Check for the triangle shape.
12912 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12913 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12914
12915 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12916 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12917 return SDValue();
12918
12919 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12920 bool IsLeftOperandMUL = false;
12921 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12922 if (MULOp == SDValue())
12923 MULOp = findMUL_LOHI(AddeSubeOp1);
12924 else
12925 IsLeftOperandMUL = true;
12926 if (MULOp == SDValue())
12927 return SDValue();
12928
12929 // Figure out the right opcode.
12930 unsigned Opc = MULOp->getOpcode();
12931 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12932
12933 // Figure out the high and low input values to the MLAL node.
12934 SDValue *HiAddSub = nullptr;
12935 SDValue *LoMul = nullptr;
12936 SDValue *LowAddSub = nullptr;
12937
12938 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12939 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12940 return SDValue();
12941
12942 if (IsLeftOperandMUL)
12943 HiAddSub = &AddeSubeOp1;
12944 else
12945 HiAddSub = &AddeSubeOp0;
12946
12947 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12948 // whose low result is fed to the ADDC/SUBC we are checking.
12949
12950 if (AddcSubcOp0 == MULOp.getValue(0)) {
12951 LoMul = &AddcSubcOp0;
12952 LowAddSub = &AddcSubcOp1;
12953 }
12954 if (AddcSubcOp1 == MULOp.getValue(0)) {
12955 LoMul = &AddcSubcOp1;
12956 LowAddSub = &AddcSubcOp0;
12957 }
12958
12959 if (!LoMul)
12960 return SDValue();
12961
12962 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12963 // the replacement below will create a cycle.
12964 if (AddcSubcNode == HiAddSub->getNode() ||
12965 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12966 return SDValue();
12967
12968 // Create the merged node.
12969 SelectionDAG &DAG = DCI.DAG;
12970
12971 // Start building operand list.
12972 SmallVector<SDValue, 8> Ops;
12973 Ops.push_back(LoMul->getOperand(0));
12974 Ops.push_back(LoMul->getOperand(1));
12975
12976 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12977 // the case, we must be doing signed multiplication and only use the higher
12978 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
12979 // addition or subtraction with the value of 0x80000000.
12980 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12981 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12982 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12983 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12984 0x80000000) {
12985 Ops.push_back(*HiAddSub);
12986 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12987 FinalOpc = ARMISD::SMMLSR;
12988 } else {
12989 FinalOpc = ARMISD::SMMLAR;
12990 }
12991 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12992 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12993
12994 return SDValue(AddeSubeNode, 0);
12995 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12996 // SMMLS is generated during instruction selection and the rest of this
12997 // function cannot handle the case where AddcSubcNode is a SUBC.
12998 return SDValue();
12999
13000 // Finish building the operand list for {U/S}MLAL
13001 Ops.push_back(*LowAddSub);
13002 Ops.push_back(*HiAddSub);
13003
13004 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13005 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13006
13007 // Replace the ADD nodes' uses with the MLA node's values.
13008 SDValue HiMLALResult(MLALNode.getNode(), 1);
13009 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13010
13011 SDValue LoMLALResult(MLALNode.getNode(), 0);
13012 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13013
13014 // Return original node to notify the driver to stop replacing.
13015 return SDValue(AddeSubeNode, 0);
13016}
13017
13018 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13019 TargetLowering::DAGCombinerInfo &DCI,
13020 const ARMSubtarget *Subtarget) {
13021 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13022 // While trying to combine for the other MLAL nodes, first search for the
13023 // chance to use UMAAL. Check if Addc uses a node which has already
13024 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13025 // as the addend, and it's handled in PerformUMLALCombine.
13026
13027 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13028 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13029
13030 // Check that we have a glued ADDC node.
13031 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13032 if (AddcNode->getOpcode() != ARMISD::ADDC)
13033 return SDValue();
13034
13035 // Find the converted UMAAL or quit if it doesn't exist.
13036 SDNode *UmlalNode = nullptr;
13037 SDValue AddHi;
13038 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13039 UmlalNode = AddcNode->getOperand(0).getNode();
13040 AddHi = AddcNode->getOperand(1);
13041 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13042 UmlalNode = AddcNode->getOperand(1).getNode();
13043 AddHi = AddcNode->getOperand(0);
13044 } else {
13045 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13046 }
13047
13048 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13049 // the ADDC as well as Zero.
13050 if (!isNullConstant(UmlalNode->getOperand(3)))
13051 return SDValue();
13052
13053 if ((isNullConstant(AddeNode->getOperand(0)) &&
13054 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13055 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13056 isNullConstant(AddeNode->getOperand(1)))) {
13057 SelectionDAG &DAG = DCI.DAG;
13058 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13059 UmlalNode->getOperand(2), AddHi };
13060 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13061 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13062
13063 // Replace the ADD nodes' uses with the UMAAL node's values.
13064 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13065 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13066
13067 // Return original node to notify the driver to stop replacing.
13068 return SDValue(AddeNode, 0);
13069 }
13070 return SDValue();
13071}
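// For example (roughly): if the ADDC adds an extra value AddHi to a UMLAL
// whose high accumulator input is zero, and the glued ADDE only adds in the
// carry, the whole cluster is replaced by ARMISD::UMAAL, which computes
// a*b + lo + AddHi as a 64-bit result.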
13072
13073 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13074 const ARMSubtarget *Subtarget) {
13075 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13076 return SDValue();
13077
13078 // Check that we have a pair of ADDC and ADDE as operands.
13079 // Both addends of the ADDE must be zero.
13080 SDNode* AddcNode = N->getOperand(2).getNode();
13081 SDNode* AddeNode = N->getOperand(3).getNode();
13082 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13083 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13084 isNullConstant(AddeNode->getOperand(0)) &&
13085 isNullConstant(AddeNode->getOperand(1)) &&
13086 (AddeNode->getOperand(2).getNode() == AddcNode))
13087 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13088 DAG.getVTList(MVT::i32, MVT::i32),
13089 {N->getOperand(0), N->getOperand(1),
13090 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13091 else
13092 return SDValue();
13093}
13094
13095 static SDValue PerformAddcSubcCombine(SDNode *N,
13096 TargetLowering::DAGCombinerInfo &DCI,
13097 const ARMSubtarget *Subtarget) {
13098 SelectionDAG &DAG(DCI.DAG);
13099
13100 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13101 // (SUBC (ADDE 0, 0, C), 1) -> C
13102 SDValue LHS = N->getOperand(0);
13103 SDValue RHS = N->getOperand(1);
13104 if (LHS->getOpcode() == ARMISD::ADDE &&
13105 isNullConstant(LHS->getOperand(0)) &&
13106 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13107 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13108 }
13109 }
13110
13111 if (Subtarget->isThumb1Only()) {
13112 SDValue RHS = N->getOperand(1);
13113 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13114 int32_t imm = C->getSExtValue();
13115 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13116 SDLoc DL(N);
13117 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13118 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13119 : ARMISD::ADDC;
13120 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13121 }
13122 }
13123 }
13124
13125 return SDValue();
13126}
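// For example (Thumb1, roughly): ARMISD::ADDC(x, -5) is rewritten above as
// ARMISD::SUBC(x, 5), since a small positive immediate is cheaper to
// materialize than a negative one.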
13127
13128 static SDValue PerformAddeSubeCombine(SDNode *N,
13129 TargetLowering::DAGCombinerInfo &DCI,
13130 const ARMSubtarget *Subtarget) {
13131 if (Subtarget->isThumb1Only()) {
13132 SelectionDAG &DAG = DCI.DAG;
13133 SDValue RHS = N->getOperand(1);
13134 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13135 int64_t imm = C->getSExtValue();
13136 if (imm < 0) {
13137 SDLoc DL(N);
13138
13139 // The with-carry-in form matches bitwise not instead of the negation.
13140 // Effectively, the inverse interpretation of the carry flag already
13141 // accounts for part of the negation.
13142 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13143
13144 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13145 : ARMISD::ADDE;
13146 return DAG.getNode(Opcode, DL, N->getVTList(),
13147 N->getOperand(0), RHS, N->getOperand(2));
13148 }
13149 }
13150 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13151 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13152 }
13153 return SDValue();
13154}
13155
13156 static SDValue PerformSELECTCombine(SDNode *N,
13157 TargetLowering::DAGCombinerInfo &DCI,
13158 const ARMSubtarget *Subtarget) {
13159 if (!Subtarget->hasMVEIntegerOps())
13160 return SDValue();
13161
13162 SDLoc dl(N);
13163 SDValue SetCC;
13164 SDValue LHS;
13165 SDValue RHS;
13166 ISD::CondCode CC;
13167 SDValue TrueVal;
13168 SDValue FalseVal;
13169
13170 if (N->getOpcode() == ISD::SELECT &&
13171 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13172 SetCC = N->getOperand(0);
13173 LHS = SetCC->getOperand(0);
13174 RHS = SetCC->getOperand(1);
13175 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13176 TrueVal = N->getOperand(1);
13177 FalseVal = N->getOperand(2);
13178 } else if (N->getOpcode() == ISD::SELECT_CC) {
13179 LHS = N->getOperand(0);
13180 RHS = N->getOperand(1);
13181 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13182 TrueVal = N->getOperand(2);
13183 FalseVal = N->getOperand(3);
13184 } else {
13185 return SDValue();
13186 }
13187
13188 unsigned int Opcode = 0;
13189 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13190 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13191 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13192 Opcode = ARMISD::VMINVu;
13193 if (CC == ISD::SETUGT)
13194 std::swap(TrueVal, FalseVal);
13195 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13196 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13197 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13198 Opcode = ARMISD::VMINVs;
13199 if (CC == ISD::SETGT)
13200 std::swap(TrueVal, FalseVal);
13201 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13202 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13203 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13204 Opcode = ARMISD::VMAXVu;
13205 if (CC == ISD::SETULT)
13206 std::swap(TrueVal, FalseVal);
13207 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13208 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13209 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13210 Opcode = ARMISD::VMAXVs;
13211 if (CC == ISD::SETLT)
13212 std::swap(TrueVal, FalseVal);
13213 } else
13214 return SDValue();
13215
13216 // Normalise to the right hand side being the vector reduction
13217 switch (TrueVal->getOpcode()) {
13218 case ISD::VECREDUCE_UMIN:
13219 case ISD::VECREDUCE_SMIN:
13220 case ISD::VECREDUCE_UMAX:
13221 case ISD::VECREDUCE_SMAX:
13222 std::swap(LHS, RHS);
13223 std::swap(TrueVal, FalseVal);
13224 break;
13225 }
13226
13227 EVT VectorType = FalseVal->getOperand(0).getValueType();
13228
13229 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13230 VectorType != MVT::v4i32)
13231 return SDValue();
13232
13233 EVT VectorScalarType = VectorType.getVectorElementType();
13234
13235 // The values being selected must also be the ones being compared
13236 if (TrueVal != LHS || FalseVal != RHS)
13237 return SDValue();
13238
13239 EVT LeftType = LHS->getValueType(0);
13240 EVT RightType = RHS->getValueType(0);
13241
13242 // The types must match the reduced type too
13243 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13244 return SDValue();
13245
13246 // Legalise the scalar to an i32
13247 if (VectorScalarType != MVT::i32)
13248 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13249
13250 // Generate the reduction as an i32 for legalisation purposes
13251 auto Reduction =
13252 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13253
13254 // The result isn't actually an i32 so truncate it back to its original type
13255 if (VectorScalarType != MVT::i32)
13256 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13257
13258 return Reduction;
13259}
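// For example (roughly): with x a scalar and v a v4i32 vector,
//   select (setult x, (vecreduce_umin v)), x, (vecreduce_umin v)
// becomes ARMISD::VMINVu(x, v), a single VMINV with x as the running minimum;
// the smin/umax/smax variants map to VMINVs/VMAXVu/VMAXVs in the same way.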
13260
13261// A special combine for the vqdmulh family of instructions. This is one of
13262// the potential set of patterns that could match this instruction. The base
13263// pattern you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13264// This matches the slightly different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13265// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13266// the max is unnecessary.
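// For example (roughly), for an i16 lane type the matched shape is
//   smin (sra (mul (sext x), (sext y)), 15), 32767
// which is replaced below by ARMISD::VQDMULH(x, y) once the clamp constant
// and the shift amount have been checked against the lane width.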
13267 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13268 EVT VT = N->getValueType(0);
13269 SDValue Shft;
13270 ConstantSDNode *Clamp;
13271
13272 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13273 return SDValue();
13274
13275 if (N->getOpcode() == ISD::SMIN) {
13276 Shft = N->getOperand(0);
13277 Clamp = isConstOrConstSplat(N->getOperand(1));
13278 } else if (N->getOpcode() == ISD::VSELECT) {
13279 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13280 SDValue Cmp = N->getOperand(0);
13281 if (Cmp.getOpcode() != ISD::SETCC ||
13282 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13283 Cmp.getOperand(0) != N->getOperand(1) ||
13284 Cmp.getOperand(1) != N->getOperand(2))
13285 return SDValue();
13286 Shft = N->getOperand(1);
13287 Clamp = isConstOrConstSplat(N->getOperand(2));
13288 } else
13289 return SDValue();
13290
13291 if (!Clamp)
13292 return SDValue();
13293
13294 MVT ScalarType;
13295 int ShftAmt = 0;
13296 switch (Clamp->getSExtValue()) {
13297 case (1 << 7) - 1:
13298 ScalarType = MVT::i8;
13299 ShftAmt = 7;
13300 break;
13301 case (1 << 15) - 1:
13302 ScalarType = MVT::i16;
13303 ShftAmt = 15;
13304 break;
13305 case (1ULL << 31) - 1:
13306 ScalarType = MVT::i32;
13307 ShftAmt = 31;
13308 break;
13309 default:
13310 return SDValue();
13311 }
13312
13313 if (Shft.getOpcode() != ISD::SRA)
13314 return SDValue();
13315 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13316 if (!N1 || N1->getSExtValue() != ShftAmt)
13317 return SDValue();
13318
13319 SDValue Mul = Shft.getOperand(0);
13320 if (Mul.getOpcode() != ISD::MUL)
13321 return SDValue();
13322
13323 SDValue Ext0 = Mul.getOperand(0);
13324 SDValue Ext1 = Mul.getOperand(1);
13325 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13326 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13327 return SDValue();
13328 EVT VecVT = Ext0.getOperand(0).getValueType();
13329 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13330 return SDValue();
13331 if (Ext1.getOperand(0).getValueType() != VecVT ||
13332 VecVT.getScalarType() != ScalarType ||
13333 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13334 return SDValue();
13335
13336 SDLoc DL(Mul);
13337 unsigned LegalLanes = 128 / (ShftAmt + 1);
13338 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13339 // For types smaller than legal vectors, extend to be legal and only use the
13340 // needed lanes.
13341 if (VecVT.getSizeInBits() < 128) {
13342 EVT ExtVecVT =
13343 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13344 VecVT.getVectorNumElements());
13345 SDValue Inp0 =
13346 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13347 SDValue Inp1 =
13348 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13349 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13350 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13351 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13352 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13353 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13354 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13355 }
13356
13357 // For larger types, split into legal sized chunks.
13358 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13359 unsigned NumParts = VecVT.getSizeInBits() / 128;
13360 SmallVector<SDValue, 4> Parts;
13361 for (unsigned I = 0; I < NumParts; ++I) {
13362 SDValue Inp0 =
13363 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13364 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13365 SDValue Inp1 =
13366 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13367 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13368 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13369 Parts.push_back(VQDMULH);
13370 }
13371 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13372 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13373}
13374
13375 static SDValue PerformVSELECTCombine(SDNode *N,
13376 TargetLowering::DAGCombinerInfo &DCI,
13377 const ARMSubtarget *Subtarget) {
13378 if (!Subtarget->hasMVEIntegerOps())
13379 return SDValue();
13380
13381 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13382 return V;
13383
13384 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13385 //
13386 // We need to re-implement this optimization here as the implementation in the
13387 // Target-Independent DAGCombiner does not handle the kind of constant we make
13388 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13389 // good reason, allowing truncation there would break other targets).
13390 //
13391 // Currently, this is only done for MVE, as it's the only target that benefits
13392 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
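// For example (roughly): vselect (xor cond, 1), a, b  -->  vselect cond, b, a,
// where the 1 may be a (possibly truncating) BUILD_VECTOR splat of ones.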
13393 if (N->getOperand(0).getOpcode() != ISD::XOR)
13394 return SDValue();
13395 SDValue XOR = N->getOperand(0);
13396
13397 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13398 // It is important to check with truncation allowed as the BUILD_VECTORs we
13399 // generate in those situations will truncate their operands.
13400 ConstantSDNode *Const =
13401 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13402 /*AllowTruncation*/ true);
13403 if (!Const || !Const->isOne())
13404 return SDValue();
13405
13406 // Rewrite into vselect(cond, rhs, lhs).
13407 SDValue Cond = XOR->getOperand(0);
13408 SDValue LHS = N->getOperand(1);
13409 SDValue RHS = N->getOperand(2);
13410 EVT Type = N->getValueType(0);
13411 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13412}
13413
13414// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13415 static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13416 TargetLowering::DAGCombinerInfo &DCI,
13417 const ARMSubtarget *Subtarget) {
13418 SDValue Op0 = N->getOperand(0);
13419 SDValue Op1 = N->getOperand(1);
13420 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13421 EVT VT = N->getValueType(0);
13422
13423 if (!Subtarget->hasMVEIntegerOps() ||
13425 return SDValue();
13426
13427 if (CC == ISD::SETUGE) {
13428 std::swap(Op0, Op1);
13429 CC = ISD::SETULT;
13430 }
13431
13432 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13433 Op0.getOpcode() != ISD::BUILD_VECTOR)
13434 return SDValue();
13435
13436 // Check first operand is BuildVector of 0,1,2,...
13437 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13438 if (!Op0.getOperand(I).isUndef() &&
13439 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13440 Op0.getConstantOperandVal(I) == I))
13441 return SDValue();
13442 }
13443
13444 // The second operand must be a splat of some value Op1S.
13445 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13446 if (!Op1S)
13447 return SDValue();
13448
13449 unsigned Opc;
13450 switch (VT.getVectorNumElements()) {
13451 case 2:
13452 Opc = Intrinsic::arm_mve_vctp64;
13453 break;
13454 case 4:
13455 Opc = Intrinsic::arm_mve_vctp32;
13456 break;
13457 case 8:
13458 Opc = Intrinsic::arm_mve_vctp16;
13459 break;
13460 case 16:
13461 Opc = Intrinsic::arm_mve_vctp8;
13462 break;
13463 default:
13464 return SDValue();
13465 }
13466
13467 SDLoc DL(N);
13468 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13469 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13470 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13471}
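// For example (roughly):
//   setcc (build_vector 0, 1, 2, 3), (splat n), setult   : v4i1
// becomes llvm.arm.mve.vctp32(n), a predicate with the first n lanes set.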
13472
13473/// PerformADDECombine - Target-specific dag combine transform from
13474/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13475/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13476 static SDValue PerformADDECombine(SDNode *N,
13477 TargetLowering::DAGCombinerInfo &DCI,
13478 const ARMSubtarget *Subtarget) {
13479 // Only ARM and Thumb2 support UMLAL/SMLAL.
13480 if (Subtarget->isThumb1Only())
13481 return PerformAddeSubeCombine(N, DCI, Subtarget);
13482
13483 // Only perform the checks after legalize when the pattern is available.
13484 if (DCI.isBeforeLegalize()) return SDValue();
13485
13486 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13487}
13488
13489/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13490/// operands N0 and N1. This is a helper for PerformADDCombine that is
13491/// called with the default operands, and if that fails, with commuted
13492/// operands.
13493 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13494 TargetLowering::DAGCombinerInfo &DCI,
13495 const ARMSubtarget *Subtarget){
13496 // Attempt to create vpadd for this add.
13497 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13498 return Result;
13499
13500 // Attempt to create vpaddl for this add.
13501 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13502 return Result;
13503 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13504 Subtarget))
13505 return Result;
13506
13507 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13508 if (N0.getNode()->hasOneUse())
13509 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13510 return Result;
13511 return SDValue();
13512}
13513
13514 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13515 EVT VT = N->getValueType(0);
13516 SDValue N0 = N->getOperand(0);
13517 SDValue N1 = N->getOperand(1);
13518 SDLoc dl(N);
13519
13520 auto IsVecReduce = [](SDValue Op) {
13521 switch (Op.getOpcode()) {
13522 case ISD::VECREDUCE_ADD:
13523 case ARMISD::VADDVs:
13524 case ARMISD::VADDVu:
13525 case ARMISD::VMLAVs:
13526 case ARMISD::VMLAVu:
13527 return true;
13528 }
13529 return false;
13530 };
13531
13532 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13533 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13534 // add(add(X, vecreduce(Y)), vecreduce(Z))
13535 // to make better use of vaddva style instructions.
13536 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13537 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13538 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13539 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13540 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13541 }
13542 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13543 // add(add(add(A, C), reduce(B)), reduce(D))
13544 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13545 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13546 unsigned N0RedOp = 0;
13547 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13548 N0RedOp = 1;
13549 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13550 return SDValue();
13551 }
13552
13553 unsigned N1RedOp = 0;
13554 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13555 N1RedOp = 1;
13556 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13557 return SDValue();
13558
13559 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13560 N1.getOperand(1 - N1RedOp));
13561 SDValue Add1 =
13562 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13563 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13564 }
13565 return SDValue();
13566 };
13567 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13568 return R;
13569 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13570 return R;
13571
13572 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13573 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13574 // by ascending load offsets. This can help cores prefetch if the order of
13575 // loads is more predictable.
13576 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13577 // Check if two reductions are known to load data where one is before/after
13578 // another. Return negative if N0 loads data before N1, positive if N1 is
13579 // before N0 and 0 otherwise if nothing is known.
13580 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13581 // Look through to the first operand of a MUL, for the VMLA case.
13582 // Currently only looks at the first operand, in the hope they are equal.
13583 if (N0.getOpcode() == ISD::MUL)
13584 N0 = N0.getOperand(0);
13585 if (N1.getOpcode() == ISD::MUL)
13586 N1 = N1.getOperand(0);
13587
13588 // Return true if the two operands are loads to the same object and the
13589 // offset of the first is known to be less than the offset of the second.
13590 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13591 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13592 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13593 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13594 Load1->isIndexed())
13595 return 0;
13596
13597 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13598 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13599
13600 if (!BaseLocDecomp0.getBase() ||
13601 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13602 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13603 return 0;
13604 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13605 return -1;
13606 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13607 return 1;
13608 return 0;
13609 };
13610
13611 SDValue X;
13612 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13613 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13614 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13615 N0.getOperand(1).getOperand(0));
13616 if (IsBefore < 0) {
13617 X = N0.getOperand(0);
13618 N0 = N0.getOperand(1);
13619 } else if (IsBefore > 0) {
13620 X = N0.getOperand(1);
13621 N0 = N0.getOperand(0);
13622 } else
13623 return SDValue();
13624 } else if (IsVecReduce(N0.getOperand(0))) {
13625 X = N0.getOperand(1);
13626 N0 = N0.getOperand(0);
13627 } else if (IsVecReduce(N0.getOperand(1))) {
13628 X = N0.getOperand(0);
13629 N0 = N0.getOperand(1);
13630 } else
13631 return SDValue();
13632 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13633 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13634 // Note this is backward to how you would expect. We create
13635 // add(reduce(load + 16), reduce(load + 0)) so that the
13636 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13637 // the X as VADDV(load + 0).
13638 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13639 } else
13640 return SDValue();
13641
13642 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13643 return SDValue();
13644
13645 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13646 return SDValue();
13647
13648 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13649 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13650 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13651 };
13652 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13653 return R;
13654 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13655 return R;
13656 return SDValue();
13657}
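// For example (roughly): add(X, add(vecreduce(Y), vecreduce(Z))) is
// re-associated above into add(add(X, vecreduce(Y)), vecreduce(Z)) so each
// reduction can later become a VADDVA-style accumulating reduction, and
// reductions of adjacent loads are ordered by ascending offset to keep the
// loads prefetch-friendly.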
13658
13659 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13660 const ARMSubtarget *Subtarget) {
13661 if (!Subtarget->hasMVEIntegerOps())
13662 return SDValue();
13663
13664 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13665 return R;
13666
13667 EVT VT = N->getValueType(0);
13668 SDValue N0 = N->getOperand(0);
13669 SDValue N1 = N->getOperand(1);
13670 SDLoc dl(N);
13671
13672 if (VT != MVT::i64)
13673 return SDValue();
13674
13675 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13676 // will look like:
13677 // t1: i32,i32 = ARMISD::VADDLVs x
13678 // t2: i64 = build_pair t1, t1:1
13679 // t3: i64 = add t2, y
13680 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13681 // the add to be simplified separately.
13682 // We also need to check for sext / zext and commutative adds.
13683 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13684 SDValue NB) {
13685 if (NB->getOpcode() != ISD::BUILD_PAIR)
13686 return SDValue();
13687 SDValue VecRed = NB->getOperand(0);
13688 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13689 VecRed.getResNo() != 0 ||
13690 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13691 return SDValue();
13692
13693 if (VecRed->getOpcode() == OpcodeA) {
13694 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13695 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13696 VecRed.getOperand(0), VecRed.getOperand(1));
13697 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13698 }
13699
13700 SmallVector<SDValue, 4> Ops(2);
13701 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13702
13703 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13704 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13705 Ops.push_back(VecRed->getOperand(I));
13706 SDValue Red =
13707 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13708 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13709 SDValue(Red.getNode(), 1));
13710 };
13711
13712 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13713 return M;
13714 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13715 return M;
13716 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13717 return M;
13718 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13719 return M;
13720 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13721 return M;
13722 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13723 return M;
13724 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13725 return M;
13726 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13727 return M;
13728 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13729 return M;
13730 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13731 return M;
13732 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13733 return M;
13734 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13735 return M;
13736 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13737 return M;
13738 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13739 return M;
13740 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13741 return M;
13742 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13743 return M;
13744 return SDValue();
13745}
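// For example (roughly): with t = ARMISD::VADDLVs(v), an (i32, i32) pair,
//   add (build_pair t, t:1), y
// becomes VADDLVAs(y.lo, y.hi, v), the accumulating form of the long
// reduction, with y split into its two i32 halves.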
13746
13747bool
13748 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13749 CombineLevel Level) const {
13750 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13751 N->getOpcode() == ISD::SRL) &&
13752 "Expected shift op");
13753
13754 SDValue ShiftLHS = N->getOperand(0);
13755 if (!ShiftLHS->hasOneUse())
13756 return false;
13757
13758 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13759 !ShiftLHS.getOperand(0)->hasOneUse())
13760 return false;
13761
13762 if (Level == BeforeLegalizeTypes)
13763 return true;
13764
13765 if (N->getOpcode() != ISD::SHL)
13766 return true;
13767
13768 if (Subtarget->isThumb1Only()) {
13769 // Avoid making expensive immediates by commuting shifts. (This logic
13770 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13771 // for free.)
13772 if (N->getOpcode() != ISD::SHL)
13773 return true;
13774 SDValue N1 = N->getOperand(0);
13775 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13776 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13777 return true;
13778 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13779 if (Const->getAPIntValue().ult(256))
13780 return false;
13781 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13782 Const->getAPIntValue().sgt(-256))
13783 return false;
13784 }
13785 return true;
13786 }
13787
13788 // Turn off commute-with-shift transform after legalization, so it doesn't
13789 // conflict with PerformSHLSimplify. (We could try to detect when
13790 // PerformSHLSimplify would trigger more precisely, but it isn't
13791 // really necessary.)
13792 return false;
13793}
13794
13795 bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13796 const SDNode *N) const {
13797 assert(N->getOpcode() == ISD::XOR &&
13798 (N->getOperand(0).getOpcode() == ISD::SHL ||
13799 N->getOperand(0).getOpcode() == ISD::SRL) &&
13800 "Expected XOR(SHIFT) pattern");
13801
13802 // Only commute if the entire NOT mask is a hidden shifted mask.
13803 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13804 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13805 if (XorC && ShiftC) {
13806 unsigned MaskIdx, MaskLen;
13807 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13808 unsigned ShiftAmt = ShiftC->getZExtValue();
13809 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13810 if (N->getOperand(0).getOpcode() == ISD::SHL)
13811 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13812 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13813 }
13814 }
13815
13816 return false;
13817}
13818
13819 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13820 const SDNode *N, CombineLevel Level) const {
13821 assert(((N->getOpcode() == ISD::SHL &&
13822 N->getOperand(0).getOpcode() == ISD::SRL) ||
13823 (N->getOpcode() == ISD::SRL &&
13824 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13825 "Expected shift-shift mask");
13826
13827 if (!Subtarget->isThumb1Only())
13828 return true;
13829
13830 if (Level == BeforeLegalizeTypes)
13831 return true;
13832
13833 return false;
13834}
13835
13836 bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13837 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13838 SDValue Y) const {
13839 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13840 SelectOpcode == ISD::VSELECT;
13841}
13842
13843 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13844 if (!Subtarget->hasNEON()) {
13845 if (Subtarget->isThumb1Only())
13846 return VT.getScalarSizeInBits() <= 32;
13847 return true;
13848 }
13849 return VT.isScalarInteger();
13850}
13851
13852 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13853 EVT VT) const {
13854 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13855 return false;
13856
13857 switch (FPVT.getSimpleVT().SimpleTy) {
13858 case MVT::f16:
13859 return Subtarget->hasVFP2Base();
13860 case MVT::f32:
13861 return Subtarget->hasVFP2Base();
13862 case MVT::f64:
13863 return Subtarget->hasFP64();
13864 case MVT::v4f32:
13865 case MVT::v8f16:
13866 return Subtarget->hasMVEFloatOps();
13867 default:
13868 return false;
13869 }
13870}
13871
13874 const ARMSubtarget *ST) {
13875 // Allow the generic combiner to identify potential bswaps.
13876 if (DCI.isBeforeLegalize())
13877 return SDValue();
13878
13879 // DAG combiner will fold:
13880 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13881 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
13882 // Other code patterns that can be also be modified have the following form:
13883 // b + ((a << 1) | 510)
13884 // b + ((a << 1) & 510)
13885 // b + ((a << 1) ^ 510)
13886 // b + ((a << 1) + 510)
13887
13888 // Many instructions can perform the shift for free, but it requires both
13889 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13890 // instruction will needed. So, unfold back to the original pattern if:
13891 // - if c1 and c2 are small enough that they don't require mov imms.
13892 // - the user(s) of the node can perform an shl
13893
13894 // No shifted operands for 16-bit instructions.
13895 if (ST->isThumb() && ST->isThumb1Only())
13896 return SDValue();
13897
13898 // Check that all the users could perform the shl themselves.
13899 for (auto *U : N->users()) {
13900 switch(U->getOpcode()) {
13901 default:
13902 return SDValue();
13903 case ISD::SUB:
13904 case ISD::ADD:
13905 case ISD::AND:
13906 case ISD::OR:
13907 case ISD::XOR:
13908 case ISD::SETCC:
13909 case ARMISD::CMP:
13910 // Check that the user isn't already using a constant because there
13911 // aren't any instructions that support an immediate operand and a
13912 // shifted operand.
13913 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13914 isa<ConstantSDNode>(U->getOperand(1)))
13915 return SDValue();
13916
13917 // Check that it's not already using a shift.
13918 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13919 U->getOperand(1).getOpcode() == ISD::SHL)
13920 return SDValue();
13921 break;
13922 }
13923 }
13924
13925 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13926 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13927 return SDValue();
13928
13929 if (N->getOperand(0).getOpcode() != ISD::SHL)
13930 return SDValue();
13931
13932 SDValue SHL = N->getOperand(0);
13933
13934 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13935 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13936 if (!C1ShlC2 || !C2)
13937 return SDValue();
13938
13939 APInt C2Int = C2->getAPIntValue();
13940 APInt C1Int = C1ShlC2->getAPIntValue();
13941 unsigned C2Width = C2Int.getBitWidth();
13942 if (C2Int.uge(C2Width))
13943 return SDValue();
13944 uint64_t C2Value = C2Int.getZExtValue();
13945
13946 // Check that performing a lshr will not lose any information.
13947 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13948 if ((C1Int & Mask) != C1Int)
13949 return SDValue();
13950
13951 // Shift the first constant.
13952 C1Int.lshrInPlace(C2Int);
13953
13954 // The immediates are encoded as an 8-bit value that can be rotated.
13955 auto LargeImm = [](const APInt &Imm) {
13956 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13957 return Imm.getBitWidth() - Zeros > 8;
13958 };
13959
13960 if (LargeImm(C1Int) || LargeImm(C2Int))
13961 return SDValue();
13962
13963 SelectionDAG &DAG = DCI.DAG;
13964 SDLoc dl(N);
13965 SDValue X = SHL.getOperand(0);
13966 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13967 DAG.getConstant(C1Int, dl, MVT::i32));
13968 // Shift left to compensate for the lshr of C1Int.
13969 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13970
13971 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13972 SHL.dump(); N->dump());
13973 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13974 return Res;
13975}
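// For example (using the 510 pattern from the comments above, roughly):
//   add b, (or (shl a, 1), 510)
// is unfolded back into
//   add b, (shl (or a, 255), 1)
// so that the shift can be folded into the add as a shifted operand.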
13976
13977
13978/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13979///
13980 static SDValue PerformADDCombine(SDNode *N,
13981 TargetLowering::DAGCombinerInfo &DCI,
13982 const ARMSubtarget *Subtarget) {
13983 SDValue N0 = N->getOperand(0);
13984 SDValue N1 = N->getOperand(1);
13985
13986 // Only works one way, because it needs an immediate operand.
13987 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13988 return Result;
13989
13990 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13991 return Result;
13992
13993 // First try with the default operand order.
13994 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13995 return Result;
13996
13997 // If that didn't work, try again with the operands commuted.
13998 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13999}
14000
14001// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14002// providing -X is as cheap as X (currently, just a constant).
14003 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14004 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14005 return SDValue();
14006 SDValue CSINC = N->getOperand(1);
14007 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14008 return SDValue();
14009
14010 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14011 if (!X)
14012 return SDValue();
14013
14014 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14015 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14016 CSINC.getOperand(0)),
14017 CSINC.getOperand(1), CSINC.getOperand(2),
14018 CSINC.getOperand(3));
14019}
14020
14021 static bool isNegatedInteger(SDValue Op) {
14022 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
14023}
14024
14025// Try to fold
14026//
14027// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14028//
14029// The folding helps cmov to be matched with csneg without generating
14030// redundant neg instruction.
14031 static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
14032 if (!isNegatedInteger(SDValue(N, 0)))
14033 return SDValue();
14034
14035 SDValue CMov = N->getOperand(1);
14036 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14037 return SDValue();
14038
14039 SDValue N0 = CMov.getOperand(0);
14040 SDValue N1 = CMov.getOperand(1);
14041
14042 // If neither of them are negations, it's not worth the folding as it
14043 // introduces two additional negations while reducing one negation.
14044 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
14045 return SDValue();
14046
14047 SDLoc DL(N);
14048 EVT VT = CMov.getValueType();
14049
14050 SDValue N0N = DAG.getNegative(N0, DL, VT);
14051 SDValue N1N = DAG.getNegative(N1, DL, VT);
14052 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14053 CMov.getOperand(3));
14054}
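// For example (roughly): sub(0, cmov(sub(0, a), b)) becomes
// cmov(a, sub(0, b)), letting the remaining negation match as CSNEG.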
14055
14056/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14057///
14058 static SDValue PerformSUBCombine(SDNode *N,
14059 TargetLowering::DAGCombinerInfo &DCI,
14060 const ARMSubtarget *Subtarget) {
14061 SDValue N0 = N->getOperand(0);
14062 SDValue N1 = N->getOperand(1);
14063
14064 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14065 if (N1.getNode()->hasOneUse())
14066 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14067 return Result;
14068
14069 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14070 return R;
14071
14072 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14073 return Val;
14074
14075 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14076 return SDValue();
14077
14078 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14079 // so that we can readily pattern match more mve instructions which can use
14080 // a scalar operand.
14081 SDValue VDup = N->getOperand(1);
14082 if (VDup->getOpcode() != ARMISD::VDUP)
14083 return SDValue();
14084
14085 SDValue VMov = N->getOperand(0);
14086 if (VMov->getOpcode() == ISD::BITCAST)
14087 VMov = VMov->getOperand(0);
14088
14089 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14090 return SDValue();
14091
14092 SDLoc dl(N);
14093 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14094 DCI.DAG.getConstant(0, dl, MVT::i32),
14095 VDup->getOperand(0));
14096 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14097}
14098
14099/// PerformVMULCombine
14100/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14101/// special multiplier accumulator forwarding.
14102/// vmul d3, d0, d2
14103/// vmla d3, d1, d2
14104/// is faster than
14105/// vadd d3, d0, d1
14106/// vmul d3, d3, d2
14107// However, for (A + B) * (A + B),
14108// vadd d2, d0, d1
14109// vmul d3, d0, d2
14110// vmla d3, d1, d2
14111// is slower than
14112// vadd d2, d0, d1
14113// vmul d3, d2, d2
14114 static SDValue PerformVMULCombine(SDNode *N,
14115 TargetLowering::DAGCombinerInfo &DCI,
14116 const ARMSubtarget *Subtarget) {
14117 if (!Subtarget->hasVMLxForwarding())
14118 return SDValue();
14119
14120 SelectionDAG &DAG = DCI.DAG;
14121 SDValue N0 = N->getOperand(0);
14122 SDValue N1 = N->getOperand(1);
14123 unsigned Opcode = N0.getOpcode();
14124 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14125 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14126 Opcode = N1.getOpcode();
14127 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14128 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14129 return SDValue();
14130 std::swap(N0, N1);
14131 }
14132
14133 if (N0 == N1)
14134 return SDValue();
14135
14136 EVT VT = N->getValueType(0);
14137 SDLoc DL(N);
14138 SDValue N00 = N0->getOperand(0);
14139 SDValue N01 = N0->getOperand(1);
14140 return DAG.getNode(Opcode, DL, VT,
14141 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14142 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14143}
14144
14145 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14146 const ARMSubtarget *Subtarget) {
14147 EVT VT = N->getValueType(0);
14148 if (VT != MVT::v2i64)
14149 return SDValue();
14150
14151 SDValue N0 = N->getOperand(0);
14152 SDValue N1 = N->getOperand(1);
14153
14154 auto IsSignExt = [&](SDValue Op) {
14155 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14156 return SDValue();
14157 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14158 if (VT.getScalarSizeInBits() == 32)
14159 return Op->getOperand(0);
14160 return SDValue();
14161 };
14162 auto IsZeroExt = [&](SDValue Op) {
14163 // Zero extends are a little more awkward. At the point we are matching
14164 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14165 // That might be before or after a bitcast depending on how the and is
14166 // placed. Because this has to look through bitcasts, it is currently only
14167 // supported on LE.
14168 if (!Subtarget->isLittle())
14169 return SDValue();
14170
14171 SDValue And = Op;
14172 if (And->getOpcode() == ISD::BITCAST)
14173 And = And->getOperand(0);
14174 if (And->getOpcode() != ISD::AND)
14175 return SDValue();
14176 SDValue Mask = And->getOperand(1);
14177 if (Mask->getOpcode() == ISD::BITCAST)
14178 Mask = Mask->getOperand(0);
14179
14180 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14181 Mask.getValueType() != MVT::v4i32)
14182 return SDValue();
14183 if (isAllOnesConstant(Mask->getOperand(0)) &&
14184 isNullConstant(Mask->getOperand(1)) &&
14185 isAllOnesConstant(Mask->getOperand(2)) &&
14186 isNullConstant(Mask->getOperand(3)))
14187 return And->getOperand(0);
14188 return SDValue();
14189 };
14190
14191 SDLoc dl(N);
14192 if (SDValue Op0 = IsSignExt(N0)) {
14193 if (SDValue Op1 = IsSignExt(N1)) {
14194 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14195 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14196 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14197 }
14198 }
14199 if (SDValue Op0 = IsZeroExt(N0)) {
14200 if (SDValue Op1 = IsZeroExt(N1)) {
14201 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14202 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14203 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14204 }
14205 }
14206
14207 return SDValue();
14208}
14209
14210 static SDValue PerformMULCombine(SDNode *N,
14211 TargetLowering::DAGCombinerInfo &DCI,
14212 const ARMSubtarget *Subtarget) {
14213 SelectionDAG &DAG = DCI.DAG;
14214
14215 EVT VT = N->getValueType(0);
14216 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14217 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14218
14219 if (Subtarget->isThumb1Only())
14220 return SDValue();
14221
14222 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14223 return SDValue();
14224
14225 if (VT.is64BitVector() || VT.is128BitVector())
14226 return PerformVMULCombine(N, DCI, Subtarget);
14227 if (VT != MVT::i32)
14228 return SDValue();
14229
14230 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14231 if (!C)
14232 return SDValue();
14233
14234 int64_t MulAmt = C->getSExtValue();
14235 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14236
14237 ShiftAmt = ShiftAmt & (32 - 1);
14238 SDValue V = N->getOperand(0);
14239 SDLoc DL(N);
14240
14241 SDValue Res;
14242 MulAmt >>= ShiftAmt;
14243
14244 if (MulAmt >= 0) {
14245 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14246 // (mul x, 2^N + 1) => (add (shl x, N), x)
14247 Res = DAG.getNode(ISD::ADD, DL, VT,
14248 V,
14249 DAG.getNode(ISD::SHL, DL, VT,
14250 V,
14251 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14252 MVT::i32)));
14253 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14254 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14255 Res = DAG.getNode(ISD::SUB, DL, VT,
14256 DAG.getNode(ISD::SHL, DL, VT,
14257 V,
14258 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14259 MVT::i32)),
14260 V);
14261 } else
14262 return SDValue();
14263 } else {
14264 uint64_t MulAmtAbs = -MulAmt;
14265 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14266 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14267 Res = DAG.getNode(ISD::SUB, DL, VT,
14268 V,
14269 DAG.getNode(ISD::SHL, DL, VT,
14270 V,
14271 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14272 MVT::i32)));
14273 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14274 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14275 Res = DAG.getNode(ISD::ADD, DL, VT,
14276 V,
14277 DAG.getNode(ISD::SHL, DL, VT,
14278 V,
14279 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14280 MVT::i32)));
14281 Res = DAG.getNode(ISD::SUB, DL, VT,
14282 DAG.getConstant(0, DL, MVT::i32), Res);
14283 } else
14284 return SDValue();
14285 }
14286
14287 if (ShiftAmt != 0)
14288 Res = DAG.getNode(ISD::SHL, DL, VT,
14289 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14290
14291 // Do not add new nodes to DAG combiner worklist.
14292 DCI.CombineTo(N, Res, false);
14293 return SDValue();
14294}
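// Illustrative trace (values chosen for exposition, not taken from the
// source): for (mul x, 40), ShiftAmt = countr_zero(40) = 3 and the remaining
// factor is 5 = 2^2 + 1, so the combine builds
//   t1: i32 = shl x, 2
//   t2: i32 = add x, t1     // 5 * x
//   t3: i32 = shl t2, 3     // 40 * x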
14295
14296static SDValue CombineANDShift(SDNode *N,
14297 TargetLowering::DAGCombinerInfo &DCI,
14298 const ARMSubtarget *Subtarget) {
14299 // Allow DAGCombine to pattern-match before we touch the canonical form.
14300 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14301 return SDValue();
14302
14303 if (N->getValueType(0) != MVT::i32)
14304 return SDValue();
14305
14306 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14307 if (!N1C)
14308 return SDValue();
14309
14310 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14311 // Don't transform uxtb/uxth.
14312 if (C1 == 255 || C1 == 65535)
14313 return SDValue();
14314
14315 SDNode *N0 = N->getOperand(0).getNode();
14316 if (!N0->hasOneUse())
14317 return SDValue();
14318
14319 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14320 return SDValue();
14321
14322 bool LeftShift = N0->getOpcode() == ISD::SHL;
14323
14324 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14325 if (!N01C)
14326 return SDValue();
14327
14328 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14329 if (!C2 || C2 >= 32)
14330 return SDValue();
14331
14332 // Clear irrelevant bits in the mask.
14333 if (LeftShift)
14334 C1 &= (-1U << C2);
14335 else
14336 C1 &= (-1U >> C2);
14337
14338 SelectionDAG &DAG = DCI.DAG;
14339 SDLoc DL(N);
14340
14341 // We have a pattern of the form "(and (shl x, c2) c1)" or
14342 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14343 // transform to a pair of shifts, to save materializing c1.
14344
14345 // First pattern: right shift, then mask off leading bits.
14346 // FIXME: Use demanded bits?
14347 if (!LeftShift && isMask_32(C1)) {
14348 uint32_t C3 = llvm::countl_zero(C1);
14349 if (C2 < C3) {
14350 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14351 DAG.getConstant(C3 - C2, DL, MVT::i32));
14352 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14353 DAG.getConstant(C3, DL, MVT::i32));
14354 }
14355 }
14356
14357 // First pattern, reversed: left shift, then mask off trailing bits.
14358 if (LeftShift && isMask_32(~C1)) {
14359 uint32_t C3 = llvm::countr_zero(C1);
14360 if (C2 < C3) {
14361 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14362 DAG.getConstant(C3 - C2, DL, MVT::i32));
14363 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14364 DAG.getConstant(C3, DL, MVT::i32));
14365 }
14366 }
14367
14368 // Second pattern: left shift, then mask off leading bits.
14369 // FIXME: Use demanded bits?
14370 if (LeftShift && isShiftedMask_32(C1)) {
14371 uint32_t Trailing = llvm::countr_zero(C1);
14372 uint32_t C3 = llvm::countl_zero(C1);
14373 if (Trailing == C2 && C2 + C3 < 32) {
14374 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14375 DAG.getConstant(C2 + C3, DL, MVT::i32));
14376 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14377 DAG.getConstant(C3, DL, MVT::i32));
14378 }
14379 }
14380
14381 // Second pattern, reversed: right shift, then mask off trailing bits.
14382 // FIXME: Handle other patterns of known/demanded bits.
14383 if (!LeftShift && isShiftedMask_32(C1)) {
14384 uint32_t Leading = llvm::countl_zero(C1);
14385 uint32_t C3 = llvm::countr_zero(C1);
14386 if (Leading == C2 && C2 + C3 < 32) {
14387 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14388 DAG.getConstant(C2 + C3, DL, MVT::i32));
14389 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14390 DAG.getConstant(C3, DL, MVT::i32));
14391 }
14392 }
14393
14394 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14395 // if "c1 >> c2" is a cheaper immediate than "c1"
14396 if (LeftShift &&
14397 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14398
14399 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14400 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14401 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14402 DAG.getConstant(C2, DL, MVT::i32));
14403 }
14404
14405 return SDValue();
14406}
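// Worked example (illustrative only): (and (srl x, 3), 0x7f) has C1 = 0x7f,
// a low-bit mask, with C2 = 3 < C3 = countl_zero(0x7f) = 25, so it becomes
//   t1: i32 = shl x, 22   // C3 - C2
//   t2: i32 = srl t1, 25  // C3
// which keeps bits [9:3] of x without materializing the mask constant.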
14407
14408static SDValue PerformANDCombine(SDNode *N,
14409 TargetLowering::DAGCombinerInfo &DCI,
14410 const ARMSubtarget *Subtarget) {
14411 // Attempt to use immediate-form VBIC
14412 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14413 SDLoc dl(N);
14414 EVT VT = N->getValueType(0);
14415 SelectionDAG &DAG = DCI.DAG;
14416
14417 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14418 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14419 return SDValue();
14420
14421 APInt SplatBits, SplatUndef;
14422 unsigned SplatBitSize;
14423 bool HasAnyUndefs;
14424 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14425 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14426 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14427 SplatBitSize == 64) {
14428 EVT VbicVT;
14429 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14430 SplatUndef.getZExtValue(), SplatBitSize,
14431 DAG, dl, VbicVT, VT, OtherModImm);
14432 if (Val.getNode()) {
14433 SDValue Input =
14434 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14435 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14436 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14437 }
14438 }
14439 }
14440
14441 if (!Subtarget->isThumb1Only()) {
14442 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14443 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14444 return Result;
14445
14446 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14447 return Result;
14448 }
14449
14450 if (Subtarget->isThumb1Only())
14451 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14452 return Result;
14453
14454 return SDValue();
14455}
14456
14457// Try combining OR nodes to SMULWB, SMULWT.
14458static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14459 TargetLowering::DAGCombinerInfo &DCI,
14460 const ARMSubtarget *Subtarget) {
14461 if (!Subtarget->hasV6Ops() ||
14462 (Subtarget->isThumb() &&
14463 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14464 return SDValue();
14465
14466 SDValue SRL = OR->getOperand(0);
14467 SDValue SHL = OR->getOperand(1);
14468
14469 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14470 SRL = OR->getOperand(1);
14471 SHL = OR->getOperand(0);
14472 }
14473 if (!isSRL16(SRL) || !isSHL16(SHL))
14474 return SDValue();
14475
14476 // The first operands to the shifts need to be the two results from the
14477 // same smul_lohi node.
14478 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14479 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14480 return SDValue();
14481
14482 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14483 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14484 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14485 return SDValue();
14486
14487 // Now we have:
14488 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14489 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14490 // For SMULWB the 16-bit value will be sign extended somehow.
14491 // For SMULWT only the SRA is required.
14492 // Check both sides of SMUL_LOHI
14493 SDValue OpS16 = SMULLOHI->getOperand(0);
14494 SDValue OpS32 = SMULLOHI->getOperand(1);
14495
14496 SelectionDAG &DAG = DCI.DAG;
14497 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14498 OpS16 = OpS32;
14499 OpS32 = SMULLOHI->getOperand(0);
14500 }
14501
14502 SDLoc dl(OR);
14503 unsigned Opcode = 0;
14504 if (isS16(OpS16, DAG))
14505 Opcode = ARMISD::SMULWB;
14506 else if (isSRA16(OpS16)) {
14507 Opcode = ARMISD::SMULWT;
14508 OpS16 = OpS16->getOperand(0);
14509 }
14510 else
14511 return SDValue();
14512
14513 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14514 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14515 return SDValue(OR, 0);
14516}
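// Rationale (informal sketch): smul_lohi yields the full 64-bit product as a
// {lo, hi} pair, so (or (srl lo, 16), (shl hi, 16)) reconstructs bits [47:16]
// of the product, i.e. (x * y) >> 16 -- exactly the value SMULWB/SMULWT
// produce for a 32 x 16 bit multiply.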
14517
14518static SDValue PerformORCombineToBFI(SDNode *N,
14519 TargetLowering::DAGCombinerInfo &DCI,
14520 const ARMSubtarget *Subtarget) {
14521 // BFI is only available on V6T2+
14522 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14523 return SDValue();
14524
14525 EVT VT = N->getValueType(0);
14526 SDValue N0 = N->getOperand(0);
14527 SDValue N1 = N->getOperand(1);
14528 SelectionDAG &DAG = DCI.DAG;
14529 SDLoc DL(N);
14530 // 1) or (and A, mask), val => ARMbfi A, val, mask
14531 // iff (val & mask) == val
14532 //
14533 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14534 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14535 // && mask == ~mask2
14536 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14537 // && ~mask == mask2
14538 // (i.e., copy a bitfield value into another bitfield of the same width)
14539
14540 if (VT != MVT::i32)
14541 return SDValue();
14542
14543 SDValue N00 = N0.getOperand(0);
14544
14545 // The value and the mask need to be constants so we can verify this is
14546 // actually a bitfield set. If the mask is 0xffff, we can do better
14547 // via a movt instruction, so don't use BFI in that case.
14548 SDValue MaskOp = N0.getOperand(1);
14549 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14550 if (!MaskC)
14551 return SDValue();
14552 unsigned Mask = MaskC->getZExtValue();
14553 if (Mask == 0xffff)
14554 return SDValue();
14555 SDValue Res;
14556 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14557 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14558 if (N1C) {
14559 unsigned Val = N1C->getZExtValue();
14560 if ((Val & ~Mask) != Val)
14561 return SDValue();
14562
14563 if (ARM::isBitFieldInvertedMask(Mask)) {
14564 Val >>= llvm::countr_zero(~Mask);
14565
14566 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14567 DAG.getConstant(Val, DL, MVT::i32),
14568 DAG.getConstant(Mask, DL, MVT::i32));
14569
14570 DCI.CombineTo(N, Res, false);
14571 // Return value from the original node to inform the combiner that N is
14572 // now dead.
14573 return SDValue(N, 0);
14574 }
14575 } else if (N1.getOpcode() == ISD::AND) {
14576 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14577 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14578 if (!N11C)
14579 return SDValue();
14580 unsigned Mask2 = N11C->getZExtValue();
14581
14582 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14583 // as is to match.
14584 if (ARM::isBitFieldInvertedMask(Mask) &&
14585 (Mask == ~Mask2)) {
14586 // The pack halfword instruction works better for masks that fit it,
14587 // so use that when it's available.
14588 if (Subtarget->hasDSP() &&
14589 (Mask == 0xffff || Mask == 0xffff0000))
14590 return SDValue();
14591 // 2a
14592 unsigned amt = llvm::countr_zero(Mask2);
14593 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14594 DAG.getConstant(amt, DL, MVT::i32));
14595 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14596 DAG.getConstant(Mask, DL, MVT::i32));
14597 DCI.CombineTo(N, Res, false);
14598 // Return value from the original node to inform the combiner that N is
14599 // now dead.
14600 return SDValue(N, 0);
14601 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14602 (~Mask == Mask2)) {
14603 // The pack halfword instruction works better for masks that fit it,
14604 // so use that when it's available.
14605 if (Subtarget->hasDSP() &&
14606 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14607 return SDValue();
14608 // 2b
14609 unsigned lsb = llvm::countr_zero(Mask);
14610 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14611 DAG.getConstant(lsb, DL, MVT::i32));
14612 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14613 DAG.getConstant(Mask2, DL, MVT::i32));
14614 DCI.CombineTo(N, Res, false);
14615 // Return value from the original node to inform the combiner that N is
14616 // now dead.
14617 return SDValue(N, 0);
14618 }
14619 }
14620
14621 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14622 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14623 ARM::isBitFieldInvertedMask(Mask)) {
14624 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14625 // where lsb(mask) == #shamt and masked bits of B are known zero.
14626 SDValue ShAmt = N00.getOperand(1);
14627 unsigned ShAmtC = ShAmt->getAsZExtVal();
14628 unsigned LSB = llvm::countr_zero(Mask);
14629 if (ShAmtC != LSB)
14630 return SDValue();
14631
14632 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14633 DAG.getConstant(~Mask, DL, MVT::i32));
14634
14635 DCI.CombineTo(N, Res, false);
14636 // Return value from the original node to inform the combiner that N is
14637 // now dead.
14638 return SDValue(N, 0);
14639 }
14640
14641 return SDValue();
14642}
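// Illustrative example of case (1) (operands are assumed, not from the
// source): for (or (and A, 0xffff00ff), 0x00002a00), Mask = 0xffff00ff is an
// inverted bitfield mask, Val = 0x2a00 satisfies (Val & ~Mask) == Val, and
// countr_zero(~Mask) = 8, so the combine emits
//   ARMISD::BFI A, 0x2a, 0xffff00ff
// which inserts the 8-bit value 0x2a into bits [15:8] of A.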
14643
14644static bool isValidMVECond(unsigned CC, bool IsFloat) {
14645 switch (CC) {
14646 case ARMCC::EQ:
14647 case ARMCC::NE:
14648 case ARMCC::LE:
14649 case ARMCC::GT:
14650 case ARMCC::GE:
14651 case ARMCC::LT:
14652 return true;
14653 case ARMCC::HS:
14654 case ARMCC::HI:
14655 return !IsFloat;
14656 default:
14657 return false;
14658 };
14659}
14660
14661static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14662 if (N->getOpcode() == ARMISD::VCMP)
14663 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14664 else if (N->getOpcode() == ARMISD::VCMPZ)
14665 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14666 else
14667 llvm_unreachable("Not a VCMP/VCMPZ!");
14668}
14669
14670static bool CanInvertMVEVCMP(SDValue N) {
14671 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14672 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14673}
14674
14675static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14676 const ARMSubtarget *Subtarget) {
14677 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14678 // together with predicates
14679 EVT VT = N->getValueType(0);
14680 SDLoc DL(N);
14681 SDValue N0 = N->getOperand(0);
14682 SDValue N1 = N->getOperand(1);
14683
14684 auto IsFreelyInvertable = [&](SDValue V) {
14685 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14686 return CanInvertMVEVCMP(V);
14687 return false;
14688 };
14689
14690 // At least one operand must be freely invertible.
14691 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14692 return SDValue();
14693
14694 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14695 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14696 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14697 return DAG.getLogicalNOT(DL, And, VT);
14698}
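// Illustrative example (assumed operands): with MVE predicates,
// (or (vcmp a, b, eq), (vcmpz c, gt)) is rewritten here as
// not(and(not(vcmp a, b, eq), not(vcmpz c, gt))); the inner NOTs are later
// folded into VCMPs with the opposite conditions (NE and LE), leaving a
// single AND plus a VPNOT, which chains better with predicated code.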
14699
14700/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14701static SDValue PerformORCombine(SDNode *N,
14702 TargetLowering::DAGCombinerInfo &DCI,
14703 const ARMSubtarget *Subtarget) {
14704 // Attempt to use immediate-form VORR
14705 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14706 SDLoc dl(N);
14707 EVT VT = N->getValueType(0);
14708 SelectionDAG &DAG = DCI.DAG;
14709
14710 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14711 return SDValue();
14712
14713 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14714 VT == MVT::v8i1 || VT == MVT::v16i1))
14715 return PerformORCombine_i1(N, DAG, Subtarget);
14716
14717 APInt SplatBits, SplatUndef;
14718 unsigned SplatBitSize;
14719 bool HasAnyUndefs;
14720 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14721 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14722 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14723 SplatBitSize == 64) {
14724 EVT VorrVT;
14725 SDValue Val =
14726 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14727 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14728 if (Val.getNode()) {
14729 SDValue Input =
14730 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14731 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14732 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14733 }
14734 }
14735 }
14736
14737 if (!Subtarget->isThumb1Only()) {
14738 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14739 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14740 return Result;
14741 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14742 return Result;
14743 }
14744
14745 SDValue N0 = N->getOperand(0);
14746 SDValue N1 = N->getOperand(1);
14747
14748 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14749 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14750 DCI.isBeforeLegalizeOps()) {
14751
14752 // The code below optimizes (or (and X, Y), Z).
14753 // The AND operand needs to have a single user to make these optimizations
14754 // profitable.
14755 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14756 return SDValue();
14757
14758 APInt SplatUndef;
14759 unsigned SplatBitSize;
14760 bool HasAnyUndefs;
14761
14762 APInt SplatBits0, SplatBits1;
14763 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14764 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14765 // Ensure that the second operand of each AND is a constant.
14766 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14767 HasAnyUndefs) && !HasAnyUndefs) {
14768 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14769 HasAnyUndefs) && !HasAnyUndefs) {
14770 // Ensure that the bit width of the constants are the same and that
14771 // the splat arguments are logical inverses as per the pattern we
14772 // are trying to simplify.
14773 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14774 SplatBits0 == ~SplatBits1) {
14775 // Canonicalize the vector type to make instruction selection
14776 // simpler.
14777 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14778 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14779 N0->getOperand(1),
14780 N0->getOperand(0),
14781 N1->getOperand(0));
14782 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14783 }
14784 }
14785 }
14786 }
14787
14788 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14789 // reasonable.
14790 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14791 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14792 return Res;
14793 }
14794
14795 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14796 return Result;
14797
14798 return SDValue();
14799}
14800
14801static SDValue PerformXORCombine(SDNode *N,
14802 TargetLowering::DAGCombinerInfo &DCI,
14803 const ARMSubtarget *Subtarget) {
14804 EVT VT = N->getValueType(0);
14805 SelectionDAG &DAG = DCI.DAG;
14806
14807 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14808 return SDValue();
14809
14810 if (!Subtarget->isThumb1Only()) {
14811 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14812 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14813 return Result;
14814
14815 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14816 return Result;
14817 }
14818
14819 if (Subtarget->hasMVEIntegerOps()) {
14820 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14821 SDValue N0 = N->getOperand(0);
14822 SDValue N1 = N->getOperand(1);
14823 const TargetLowering *TLI = Subtarget->getTargetLowering();
14824 if (TLI->isConstTrueVal(N1) &&
14825 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14826 if (CanInvertMVEVCMP(N0)) {
14827 SDLoc DL(N0);
14828 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14829
14830 SmallVector<SDValue, 4> Ops;
14831 Ops.push_back(N0->getOperand(0));
14832 if (N0->getOpcode() == ARMISD::VCMP)
14833 Ops.push_back(N0->getOperand(1));
14834 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14835 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14836 }
14837 }
14838 }
14839
14840 return SDValue();
14841}
14842
14843// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14844// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14845// their position in "to" (Rd).
14846static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14847 assert(N->getOpcode() == ARMISD::BFI);
14848
14849 SDValue From = N->getOperand(1);
14850 ToMask = ~N->getConstantOperandAPInt(2);
14851 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14852
14853 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14854 // #C in the base of the SHR.
14855 if (From->getOpcode() == ISD::SRL &&
14856 isa<ConstantSDNode>(From->getOperand(1))) {
14857 APInt Shift = From->getConstantOperandAPInt(1);
14858 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14859 FromMask <<= Shift.getLimitedValue(31);
14860 From = From->getOperand(0);
14861 }
14862
14863 return From;
14864}
14865
14866// If A and B contain one contiguous set of bits, does A | B == A . B?
14867//
14868// Neither A nor B may be zero.
14869static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14870 unsigned LastActiveBitInA = A.countr_zero();
14871 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14872 return LastActiveBitInA - 1 == FirstActiveBitInB;
14873}
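// Example (illustrative): for A = 0b1100 and B = 0b0011, the lowest set bit
// of A is bit 2 and the highest set bit of B is bit 1, so 2 - 1 == 1 and
// A | B forms one contiguous run. For A = 0b1000, B = 0b0001 the check fails
// because of the gap at bits 1-2.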
14874
14876 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14877 APInt ToMask, FromMask;
14878 SDValue From = ParseBFI(N, ToMask, FromMask);
14879 SDValue To = N->getOperand(0);
14880
14881 SDValue V = To;
14882 if (V.getOpcode() != ARMISD::BFI)
14883 return SDValue();
14884
14885 APInt NewToMask, NewFromMask;
14886 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14887 if (NewFrom != From)
14888 return SDValue();
14889
14890 // Do the written bits conflict with any we've seen so far?
14891 if ((NewToMask & ToMask).getBoolValue())
14892 // Conflicting bits.
14893 return SDValue();
14894
14895 // Are the new bits contiguous when combined with the old bits?
14896 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14897 BitsProperlyConcatenate(FromMask, NewFromMask))
14898 return V;
14899 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14900 BitsProperlyConcatenate(NewFromMask, FromMask))
14901 return V;
14902
14903 return SDValue();
14904}
14905
14906static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14907 SDValue N0 = N->getOperand(0);
14908 SDValue N1 = N->getOperand(1);
14909
14910 if (N1.getOpcode() == ISD::AND) {
14911 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14912 // the bits being cleared by the AND are not demanded by the BFI.
14913 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14914 if (!N11C)
14915 return SDValue();
14916 unsigned InvMask = N->getConstantOperandVal(2);
14917 unsigned LSB = llvm::countr_zero(~InvMask);
14918 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14919 assert(Width <
14920 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14921 "undefined behavior");
14922 unsigned Mask = (1u << Width) - 1;
14923 unsigned Mask2 = N11C->getZExtValue();
14924 if ((Mask & (~Mask2)) == 0)
14925 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14926 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14927 return SDValue();
14928 }
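// Illustrative numbers (assumed, not from the source): with an insertion
// mask InvMask such that ~InvMask = 0x0000ff00, LSB = 8 and Width = 8, so
// Mask = 0xff. If the AND constant covers those low 8 bits (e.g.
// Mask2 = 0xff), then (Mask & ~Mask2) == 0 and the AND clears no bits the
// BFI will read, so it can be dropped.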
14929
14930 // Look for another BFI to combine with.
14931 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14932 // We've found a BFI.
14933 APInt ToMask1, FromMask1;
14934 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14935
14936 APInt ToMask2, FromMask2;
14937 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14938 assert(From1 == From2);
14939 (void)From2;
14940
14941 // Create a new BFI, combining the two together.
14942 APInt NewFromMask = FromMask1 | FromMask2;
14943 APInt NewToMask = ToMask1 | ToMask2;
14944
14945 EVT VT = N->getValueType(0);
14946 SDLoc dl(N);
14947
14948 if (NewFromMask[0] == 0)
14949 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14950 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14951 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14952 DAG.getConstant(~NewToMask, dl, VT));
14953 }
14954
14955 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14956 // that lower bit insertions are performed first, provided that M1 and M2
14957 // do not overlap. This can allow multiple BFI instructions to be combined
14958 // together by the other folds above.
14959 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14960 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14961 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14962
14963 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14964 ToMask1.countl_zero() < ToMask2.countl_zero())
14965 return SDValue();
14966
14967 EVT VT = N->getValueType(0);
14968 SDLoc dl(N);
14969 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14970 N->getOperand(1), N->getOperand(2));
14971 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14972 N0.getOperand(2));
14973 }
14974
14975 return SDValue();
14976}
14977
14978// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14979// or CMPZ(CMOV(1, 0, CC, X))
14980// return X if valid.
14981static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14982 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14983 return SDValue();
14984 SDValue CSInc = Cmp->getOperand(0);
14985
14986 // Ignore any `And 1` nodes that may not yet have been removed. We are
14987 // looking for a value that produces 1/0, so these have no effect on the
14988 // code.
14989 while (CSInc.getOpcode() == ISD::AND &&
14990 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14991 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14992 CSInc = CSInc.getOperand(0);
14993
14994 if (CSInc.getOpcode() == ARMISD::CSINC &&
14995 isNullConstant(CSInc.getOperand(0)) &&
14996 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14997 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14998 return CSInc.getOperand(3);
14999 }
15000 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15001 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15002 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15003 return CSInc.getOperand(3);
15004 }
15005 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15006 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15007 CC = ARMCC::getOppositeCondition(
15008 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
15009 return CSInc.getOperand(3);
15010 }
15011 return SDValue();
15012}
15013
15014static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
15015 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15016 // t92: flags = ARMISD::CMPZ t74, 0
15017 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15018 // t96: flags = ARMISD::CMPZ t93, 0
15019 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15020 ARMCC::CondCodes Cond;
15021 if (SDValue C = IsCMPZCSINC(N, Cond))
15022 if (Cond == ARMCC::EQ)
15023 return C;
15024 return SDValue();
15025}
15026
15027static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
15028 // Fold away an unnecessary CMPZ/CSINC
15029 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15030 // if C1==EQ -> CSXYZ A, B, C2, D
15031 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15032 ARMCC::CondCodes Cond;
15033 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15034 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15035 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15036 N->getOperand(1),
15037 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15038 if (N->getConstantOperandVal(2) == ARMCC::NE)
15039 return DAG.getNode(
15040 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15041 N->getOperand(1),
15042 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15043 }
15044 return SDValue();
15045}
15046
15047/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15048/// ARMISD::VMOVRRD.
15049static SDValue PerformVMOVRRDCombine(SDNode *N,
15050 TargetLowering::DAGCombinerInfo &DCI,
15051 const ARMSubtarget *Subtarget) {
15052 // vmovrrd(vmovdrr x, y) -> x,y
15053 SDValue InDouble = N->getOperand(0);
15054 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15055 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15056
15057 // vmovrrd(load f64) -> (load i32), (load i32)
15058 SDNode *InNode = InDouble.getNode();
15059 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15060 InNode->getValueType(0) == MVT::f64 &&
15061 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15062 !cast<LoadSDNode>(InNode)->isVolatile()) {
15063 // TODO: Should this be done for non-FrameIndex operands?
15064 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15065
15066 SelectionDAG &DAG = DCI.DAG;
15067 SDLoc DL(LD);
15068 SDValue BasePtr = LD->getBasePtr();
15069 SDValue NewLD1 =
15070 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15071 LD->getAlign(), LD->getMemOperand()->getFlags());
15072
15073 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15074 DAG.getConstant(4, DL, MVT::i32));
15075
15076 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15077 LD->getPointerInfo().getWithOffset(4),
15078 commonAlignment(LD->getAlign(), 4),
15079 LD->getMemOperand()->getFlags());
15080
15081 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15082 if (DCI.DAG.getDataLayout().isBigEndian())
15083 std::swap (NewLD1, NewLD2);
15084 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15085 return Result;
15086 }
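// For example (illustrative): vmovrrd (load f64 [FI]) becomes two i32 loads
// from [FI] and [FI + 4]; on big-endian targets the two results are swapped
// so each half still ends up in the expected GPR.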
15087
15088 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15089 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15090 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15091 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15092 SDValue BV = InDouble.getOperand(0);
15093 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15094 // change lane order under big endian.
15095 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15096 while (
15097 (BV.getOpcode() == ISD::BITCAST ||
15098 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15099 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15100 BVSwap = BV.getOpcode() == ISD::BITCAST;
15101 BV = BV.getOperand(0);
15102 }
15103 if (BV.getValueType() != MVT::v4i32)
15104 return SDValue();
15105
15106 // Handle buildvectors, pulling out the correct lane depending on
15107 // endianness.
15108 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15109 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15110 SDValue Op0 = BV.getOperand(Offset);
15111 SDValue Op1 = BV.getOperand(Offset + 1);
15112 if (!Subtarget->isLittle() && BVSwap)
15113 std::swap(Op0, Op1);
15114
15115 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15116 }
15117
15118 // A chain of insert_vectors, grabbing the correct value of the chain of
15119 // inserts.
15120 SDValue Op0, Op1;
15121 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15122 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15123 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15124 Op0 = BV.getOperand(1);
15125 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15126 Op1 = BV.getOperand(1);
15127 }
15128 BV = BV.getOperand(0);
15129 }
15130 if (!Subtarget->isLittle() && BVSwap)
15131 std::swap(Op0, Op1);
15132 if (Op0 && Op1)
15133 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15134 }
15135
15136 return SDValue();
15137}
15138
15139/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15140/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15141static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15142 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15143 SDValue Op0 = N->getOperand(0);
15144 SDValue Op1 = N->getOperand(1);
15145 if (Op0.getOpcode() == ISD::BITCAST)
15146 Op0 = Op0.getOperand(0);
15147 if (Op1.getOpcode() == ISD::BITCAST)
15148 Op1 = Op1.getOperand(0);
15149 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15150 Op0.getNode() == Op1.getNode() &&
15151 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15152 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15153 N->getValueType(0), Op0.getOperand(0));
15154 return SDValue();
15155}
15156
15157static SDValue PerformVMOVhrCombine(SDNode *N,
15158 TargetLowering::DAGCombinerInfo &DCI) {
15159 SDValue Op0 = N->getOperand(0);
15160
15161 // VMOVhr (VMOVrh (X)) -> X
15162 if (Op0->getOpcode() == ARMISD::VMOVrh)
15163 return Op0->getOperand(0);
15164
15165 // FullFP16: half values are passed in S-registers, and we don't
15166 // need any of the bitcast and moves:
15167 //
15168 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15169 // t5: i32 = bitcast t2
15170 // t18: f16 = ARMISD::VMOVhr t5
15171 // =>
15172 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15173 if (Op0->getOpcode() == ISD::BITCAST) {
15174 SDValue Copy = Op0->getOperand(0);
15175 if (Copy.getValueType() == MVT::f32 &&
15176 Copy->getOpcode() == ISD::CopyFromReg) {
15177 bool HasGlue = Copy->getNumOperands() == 3;
15178 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15179 HasGlue ? Copy->getOperand(2) : SDValue()};
15180 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15181 SDValue NewCopy =
15182 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
15183 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15184 ArrayRef(Ops, HasGlue ? 3 : 2));
15185
15186 // Update Users, Chains, and Potential Glue.
15187 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15188 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15189 if (HasGlue)
15190 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15191 NewCopy.getValue(2));
15192
15193 return NewCopy;
15194 }
15195 }
15196
15197 // fold (VMOVhr (load x)) -> (load (f16*)x)
15198 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15199 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15200 LN0->getMemoryVT() == MVT::i16) {
15201 SDValue Load =
15202 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15203 LN0->getBasePtr(), LN0->getMemOperand());
15204 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15205 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15206 return Load;
15207 }
15208 }
15209
15210 // Only the bottom 16 bits of the source register are used.
15211 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15212 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15213 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15214 return SDValue(N, 0);
15215
15216 return SDValue();
15217}
15218
15219static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15220 SDValue N0 = N->getOperand(0);
15221 EVT VT = N->getValueType(0);
15222
15223 // fold (VMOVrh (fpconst x)) -> const x
15224 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15225 APFloat V = C->getValueAPF();
15226 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15227 }
15228
15229 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15230 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15231 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15232
15233 SDValue Load =
15234 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15235 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15236 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15237 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15238 return Load;
15239 }
15240
15241 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15242 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15243 isa<ConstantSDNode>(N0->getOperand(1)))
15244 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15245 N0->getOperand(1));
15246
15247 return SDValue();
15248}
15249
15250/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15251/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15252/// i64 vector to have f64 elements, since the value can then be loaded
15253/// directly into a VFP register.
15255 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15256 for (unsigned i = 0; i < NumElts; ++i) {
15257 SDNode *Elt = N->getOperand(i).getNode();
15258 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15259 return true;
15260 }
15261 return false;
15262}
15263
15264/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15265/// ISD::BUILD_VECTOR.
15267static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15268 const ARMSubtarget *Subtarget) {
15269 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15270 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15271 // into a pair of GPRs, which is fine when the value is used as a scalar,
15272 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15273 SelectionDAG &DAG = DCI.DAG;
15274 if (N->getNumOperands() == 2)
15275 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15276 return RV;
15277
15278 // Load i64 elements as f64 values so that type legalization does not split
15279 // them up into i32 values.
15280 EVT VT = N->getValueType(0);
15281 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15282 return SDValue();
15283 SDLoc dl(N);
15284 SmallVector<SDValue, 8> Ops;
15285 unsigned NumElts = VT.getVectorNumElements();
15286 for (unsigned i = 0; i < NumElts; ++i) {
15287 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15288 Ops.push_back(V);
15289 // Make the DAGCombiner fold the bitcast.
15290 DCI.AddToWorklist(V.getNode());
15291 }
15292 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15293 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15294 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15295}
15296
15297/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15298static SDValue
15299PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15300 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15301 // At that time, we may have inserted bitcasts from integer to float.
15302 // If these bitcasts have survived DAGCombine, change the lowering of this
15303 // BUILD_VECTOR into something more vector friendly, i.e., something that
15304 // does not force the use of floating point types.
15305
15306 // Make sure we can change the type of the vector.
15307 // This is possible iff:
15308 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15309 // 1.1. Vector is used only once.
15310 // 1.2. Use is a bit convert to an integer type.
15311 // 2. The size of its operands are 32-bits (64-bits are not legal).
15312 EVT VT = N->getValueType(0);
15313 EVT EltVT = VT.getVectorElementType();
15314
15315 // Check 1.1. and 2.
15316 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15317 return SDValue();
15318
15319 // By construction, the input type must be float.
15320 assert(EltVT == MVT::f32 && "Unexpected type!");
15321
15322 // Check 1.2.
15323 SDNode *Use = *N->user_begin();
15324 if (Use->getOpcode() != ISD::BITCAST ||
15325 Use->getValueType(0).isFloatingPoint())
15326 return SDValue();
15327
15328 // Check profitability.
15329 // The model is: if more than half of the relevant operands are bitcast from
15330 // i32, turn the build_vector into a sequence of insert_vector_elt.
15331 // Relevant operands are everything that is not statically
15332 // (i.e., at compile time) bitcasted.
15333 unsigned NumOfBitCastedElts = 0;
15334 unsigned NumElts = VT.getVectorNumElements();
15335 unsigned NumOfRelevantElts = NumElts;
15336 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15337 SDValue Elt = N->getOperand(Idx);
15338 if (Elt->getOpcode() == ISD::BITCAST) {
15339 // Assume only bit cast to i32 will go away.
15340 if (Elt->getOperand(0).getValueType() == MVT::i32)
15341 ++NumOfBitCastedElts;
15342 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15343 // Constants are statically casted, thus do not count them as
15344 // relevant operands.
15345 --NumOfRelevantElts;
15346 }
15347
15348 // Check if more than half of the elements require a non-free bitcast.
15349 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15350 return SDValue();
15351
15352 SelectionDAG &DAG = DCI.DAG;
15353 // Create the new vector type.
15354 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15355 // Check if the type is legal.
15356 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15357 if (!TLI.isTypeLegal(VecVT))
15358 return SDValue();
15359
15360 // Combine:
15361 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15362 // => BITCAST INSERT_VECTOR_ELT
15363 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15364 // (BITCAST EN), N.
15365 SDValue Vec = DAG.getUNDEF(VecVT);
15366 SDLoc dl(N);
15367 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15368 SDValue V = N->getOperand(Idx);
15369 if (V.isUndef())
15370 continue;
15371 if (V.getOpcode() == ISD::BITCAST &&
15372 V->getOperand(0).getValueType() == MVT::i32)
15373 // Fold obvious case.
15374 V = V.getOperand(0);
15375 else {
15376 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15377 // Make the DAGCombiner fold the bitcasts.
15378 DCI.AddToWorklist(V.getNode());
15379 }
15380 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15381 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15382 }
15383 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15384 // Make the DAGCombiner fold the bitcasts.
15385 DCI.AddToWorklist(Vec.getNode());
15386 return Vec;
15387}
15388
15389static SDValue
15390PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15391 EVT VT = N->getValueType(0);
15392 SDValue Op = N->getOperand(0);
15393 SDLoc dl(N);
15394
15395 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15396 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15397 // If the valuetypes are the same, we can remove the cast entirely.
15398 if (Op->getOperand(0).getValueType() == VT)
15399 return Op->getOperand(0);
15400 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15401 }
15402
15403 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15404 // more VPNOT which might get folded as else predicates.
15405 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15406 SDValue X =
15407 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15408 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15409 DCI.DAG.getConstant(65535, dl, MVT::i32));
15410 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15411 }
15412
15413 // Only the bottom 16 bits of the source register are used.
15414 if (Op.getValueType() == MVT::i32) {
15415 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15416 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15417 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15418 return SDValue(N, 0);
15419 }
15420 return SDValue();
15421}
15422
15423static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15424 const ARMSubtarget *ST) {
15425 EVT VT = N->getValueType(0);
15426 SDValue Op = N->getOperand(0);
15427 SDLoc dl(N);
15428
15429 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15430 if (ST->isLittle())
15431 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15432
15433 // VT VECTOR_REG_CAST (VT Op) -> Op
15434 if (Op.getValueType() == VT)
15435 return Op;
15436 // VECTOR_REG_CAST undef -> undef
15437 if (Op.isUndef())
15438 return DAG.getUNDEF(VT);
15439
15440 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15441 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15442 // If the valuetypes are the same, we can remove the cast entirely.
15443 if (Op->getOperand(0).getValueType() == VT)
15444 return Op->getOperand(0);
15445 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15446 }
15447
15448 return SDValue();
15449}
15450
15451static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15452 const ARMSubtarget *Subtarget) {
15453 if (!Subtarget->hasMVEIntegerOps())
15454 return SDValue();
15455
15456 EVT VT = N->getValueType(0);
15457 SDValue Op0 = N->getOperand(0);
15458 SDValue Op1 = N->getOperand(1);
15459 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15460 SDLoc dl(N);
15461
15462 // vcmp X, 0, cc -> vcmpz X, cc
15463 if (isZeroVector(Op1))
15464 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15465
15466 unsigned SwappedCond = getSwappedCondition(Cond);
15467 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15468 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15469 if (isZeroVector(Op0))
15470 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15471 DAG.getConstant(SwappedCond, dl, MVT::i32));
15472 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15473 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15474 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15475 DAG.getConstant(SwappedCond, dl, MVT::i32));
15476 }
15477
15478 return SDValue();
15479}
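// Example (illustrative): vcmp (vdup y), x, ge has a VDUP on the left only,
// so it is rewritten as vcmp x, (vdup y), le -- the operands are swapped and
// the condition replaced by its swapped form.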
15480
15481/// PerformInsertEltCombine - Target-specific dag combine xforms for
15482/// ISD::INSERT_VECTOR_ELT.
15483static SDValue PerformInsertEltCombine(SDNode *N,
15484 TargetLowering::DAGCombinerInfo &DCI) {
15485 // Bitcast an i64 load inserted into a vector to f64.
15486 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15487 EVT VT = N->getValueType(0);
15488 SDNode *Elt = N->getOperand(1).getNode();
15489 if (VT.getVectorElementType() != MVT::i64 ||
15490 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15491 return SDValue();
15492
15493 SelectionDAG &DAG = DCI.DAG;
15494 SDLoc dl(N);
15495 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15496 VT.getVectorNumElements());
15497 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15498 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15499 // Make the DAGCombiner fold the bitcasts.
15500 DCI.AddToWorklist(Vec.getNode());
15501 DCI.AddToWorklist(V.getNode());
15502 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15503 Vec, V, N->getOperand(2));
15504 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15505}
15506
15507// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15508// directly or bitcast to an integer if the original is a float vector.
15509// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15510// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15511static SDValue
15512PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15513 EVT VT = N->getValueType(0);
15514 SDLoc dl(N);
15515
15516 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15517 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15518 return SDValue();
15519
15520 SDValue Ext = SDValue(N, 0);
15521 if (Ext.getOpcode() == ISD::BITCAST &&
15522 Ext.getOperand(0).getValueType() == MVT::f32)
15523 Ext = Ext.getOperand(0);
15524 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15525 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15526 Ext.getConstantOperandVal(1) % 2 != 0)
15527 return SDValue();
15528 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15529 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15530 return SDValue();
15531
15532 SDValue Op0 = Ext.getOperand(0);
15533 EVT VecVT = Op0.getValueType();
15534 unsigned ResNo = Op0.getResNo();
15535 unsigned Lane = Ext.getConstantOperandVal(1);
15536 if (VecVT.getVectorNumElements() != 4)
15537 return SDValue();
15538
15539 // Find another extract, of Lane + 1
15540 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15541 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15542 isa<ConstantSDNode>(V->getOperand(1)) &&
15543 V->getConstantOperandVal(1) == Lane + 1 &&
15544 V->getOperand(0).getResNo() == ResNo;
15545 });
15546 if (OtherIt == Op0->users().end())
15547 return SDValue();
15548
15549 // For float extracts, we need to be converting to a i32 for both vector
15550 // lanes.
15551 SDValue OtherExt(*OtherIt, 0);
15552 if (OtherExt.getValueType() != MVT::i32) {
15553 if (!OtherExt->hasOneUse() ||
15554 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15555 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15556 return SDValue();
15557 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15558 }
15559
15560 // Convert the type to a f64 and extract with a VMOVRRD.
15561 SDValue F64 = DCI.DAG.getNode(
15562 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15563 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15564 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15565 SDValue VMOVRRD =
15566 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15567
15568 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15569 return VMOVRRD;
15570}
15571
15572static SDValue PerformExtractEltCombine(SDNode *N,
15573 TargetLowering::DAGCombinerInfo &DCI,
15574 const ARMSubtarget *ST) {
15575 SDValue Op0 = N->getOperand(0);
15576 EVT VT = N->getValueType(0);
15577 SDLoc dl(N);
15578
15579 // extract (vdup x) -> x
15580 if (Op0->getOpcode() == ARMISD::VDUP) {
15581 SDValue X = Op0->getOperand(0);
15582 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15583 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15584 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15585 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15586 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15587 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15588
15589 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15590 X = X->getOperand(0);
15591 if (X.getValueType() == VT)
15592 return X;
15593 }
15594
15595 // extract ARM_BUILD_VECTOR -> x
15596 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15597 isa<ConstantSDNode>(N->getOperand(1)) &&
15598 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15599 return Op0.getOperand(N->getConstantOperandVal(1));
15600 }
15601
15602 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15603 if (Op0.getValueType() == MVT::v4i32 &&
15604 isa<ConstantSDNode>(N->getOperand(1)) &&
15605 Op0.getOpcode() == ISD::BITCAST &&
15606 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15607 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15608 SDValue BV = Op0.getOperand(0);
15609 unsigned Offset = N->getConstantOperandVal(1);
15610 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15611 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15612 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15613 }
15614
15615 // extract x, n; extract x, n+1 -> VMOVRRD x
15616 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15617 return R;
15618
15619 // extract (MVETrunc(x)) -> extract x
15620 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15621 unsigned Idx = N->getConstantOperandVal(1);
15622 unsigned Vec =
15623 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15624 unsigned SubIdx =
15625 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15626 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15627 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15628 }
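// Index math example (illustrative): for MVETRUNC(x: v4i32, y: v4i32)
// producing v8i16, extracting lane 6 gives Vec = 6 / 4 = 1 and
// SubIdx = 6 % 4 = 2, i.e. the extract is taken directly from lane 2 of y.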
15629
15630 return SDValue();
15631}
15632
15633static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15634 SDValue Op = N->getOperand(0);
15635 EVT VT = N->getValueType(0);
15636
15637 // sext_inreg(VGETLANEu) -> VGETLANEs
15638 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15639 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15640 Op.getOperand(0).getValueType().getScalarType())
15641 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15642 Op.getOperand(1));
15643
15644 return SDValue();
15645}
15646
15647static SDValue
15648PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15649 SDValue Vec = N->getOperand(0);
15650 SDValue SubVec = N->getOperand(1);
15651 uint64_t IdxVal = N->getConstantOperandVal(2);
15652 EVT VecVT = Vec.getValueType();
15653 EVT SubVT = SubVec.getValueType();
15654
15655 // Only do this for legal fixed vector types.
15656 if (!VecVT.isFixedLengthVector() ||
15657 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15658 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15659 return SDValue();
15660
15661 // Ignore widening patterns.
15662 if (IdxVal == 0 && Vec.isUndef())
15663 return SDValue();
15664
15665 // Subvector must be half the width and an "aligned" insertion.
15666 unsigned NumSubElts = SubVT.getVectorNumElements();
15667 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15668 (IdxVal != 0 && IdxVal != NumSubElts))
15669 return SDValue();
15670
15671 // Fold insert_subvector -> concat_vectors
15672 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15673 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15674 SDLoc DL(N);
15675 SDValue Lo, Hi;
15676 if (IdxVal == 0) {
15677 Lo = SubVec;
15678 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15679 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15680 } else {
15681 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15682 DCI.DAG.getVectorIdxConstant(0, DL));
15683 Hi = SubVec;
15684 }
15685 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15686}
15687
15688// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15689static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15690 SelectionDAG &DAG) {
15691 SDValue Trunc = N->getOperand(0);
15692 EVT VT = Trunc.getValueType();
15693 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15694 return SDValue();
15695
15696 SDLoc DL(Trunc);
15697 if (isVMOVNTruncMask(N->getMask(), VT, false))
15698 return DAG.getNode(
15699 ARMISD::VMOVN, DL, VT,
15700 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15701 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15702 DAG.getConstant(1, DL, MVT::i32));
15703 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15704 return DAG.getNode(
15705 ARMISD::VMOVN, DL, VT,
15706 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15707 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15708 DAG.getConstant(1, DL, MVT::i32));
15709 return SDValue();
15710}
15711
15712/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15713/// ISD::VECTOR_SHUFFLE.
15714static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15715 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15716 return R;
15717
15718 // The LLVM shufflevector instruction does not require the shuffle mask
15719 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15720 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15721 // operands do not match the mask length, they are extended by concatenating
15722 // them with undef vectors. That is probably the right thing for other
15723 // targets, but for NEON it is better to concatenate two double-register
15724 // size vector operands into a single quad-register size vector. Do that
15725 // transformation here:
15726 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15727 // shuffle(concat(v1, v2), undef)
15728 SDValue Op0 = N->getOperand(0);
15729 SDValue Op1 = N->getOperand(1);
15730 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15731 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15732 Op0.getNumOperands() != 2 ||
15733 Op1.getNumOperands() != 2)
15734 return SDValue();
15735 SDValue Concat0Op1 = Op0.getOperand(1);
15736 SDValue Concat1Op1 = Op1.getOperand(1);
15737 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15738 return SDValue();
15739 // Skip the transformation if any of the types are illegal.
15740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15741 EVT VT = N->getValueType(0);
15742 if (!TLI.isTypeLegal(VT) ||
15743 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15744 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15745 return SDValue();
15746
15747 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15748 Op0.getOperand(0), Op1.getOperand(0));
15749 // Translate the shuffle mask.
15750 SmallVector<int, 16> NewMask;
15751 unsigned NumElts = VT.getVectorNumElements();
15752 unsigned HalfElts = NumElts/2;
15753 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15754 for (unsigned n = 0; n < NumElts; ++n) {
15755 int MaskElt = SVN->getMaskElt(n);
15756 int NewElt = -1;
15757 if (MaskElt < (int)HalfElts)
15758 NewElt = MaskElt;
15759 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15760 NewElt = HalfElts + MaskElt - NumElts;
15761 NewMask.push_back(NewElt);
15762 }
15763 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15764 DAG.getUNDEF(VT), NewMask);
15765}
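// Mask translation example (illustrative): with v2i32 inputs and a v4i32
// result, shuffle(concat(v1, undef), concat(v2, undef), <0, 1, 4, 5>)
// becomes shuffle(concat(v1, v2), undef, <0, 1, 2, 3>); elements taken from
// the second operand are remapped into the upper half of the new vector.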
15766
15767/// Load/store instruction that can be merged with a base address
15768/// update
15769struct BaseUpdateTarget {
15770 SDNode *N;
15771 bool isIntrinsic;
15772 bool isStore;
15773 unsigned AddrOpIdx;
15774};
15775
15776struct BaseUpdateUser {
15777 /// Instruction that updates a pointer
15778 SDNode *N;
15779 /// Pointer increment operand
15780 SDValue Inc;
15781 /// Pointer increment value if it is a constant, or 0 otherwise
15782 unsigned ConstInc;
15783};
15784
15785static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15786 // Check that the add is independent of the load/store.
15787 // Otherwise, folding it would create a cycle. Search through Addr
15788 // as well, since the User may not be a direct user of Addr and
15789 // only share a base pointer.
15790 SmallPtrSet<const SDNode *, 32> Visited;
15791 SmallVector<const SDNode *, 16> Worklist;
15792 Worklist.push_back(N);
15793 Worklist.push_back(User);
15794 const unsigned MaxSteps = 1024;
15795 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15796 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15797 return false;
15798 return true;
15799}
15800
15801static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15802 struct BaseUpdateUser &User,
15803 bool SimpleConstIncOnly,
15804 TargetLowering::DAGCombinerInfo &DCI) {
15805 SelectionDAG &DAG = DCI.DAG;
15806 SDNode *N = Target.N;
15807 MemSDNode *MemN = cast<MemSDNode>(N);
15808 SDLoc dl(N);
15809
15810 // Find the new opcode for the updating load/store.
15811 bool isLoadOp = true;
15812 bool isLaneOp = false;
15813 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15814 // as an operand.
15815 bool hasAlignment = true;
15816 unsigned NewOpc = 0;
15817 unsigned NumVecs = 0;
15818 if (Target.isIntrinsic) {
15819 unsigned IntNo = N->getConstantOperandVal(1);
15820 switch (IntNo) {
15821 default:
15822 llvm_unreachable("unexpected intrinsic for Neon base update");
15823 case Intrinsic::arm_neon_vld1:
15824 NewOpc = ARMISD::VLD1_UPD;
15825 NumVecs = 1;
15826 break;
15827 case Intrinsic::arm_neon_vld2:
15828 NewOpc = ARMISD::VLD2_UPD;
15829 NumVecs = 2;
15830 break;
15831 case Intrinsic::arm_neon_vld3:
15832 NewOpc = ARMISD::VLD3_UPD;
15833 NumVecs = 3;
15834 break;
15835 case Intrinsic::arm_neon_vld4:
15836 NewOpc = ARMISD::VLD4_UPD;
15837 NumVecs = 4;
15838 break;
15839 case Intrinsic::arm_neon_vld1x2:
15840 NewOpc = ARMISD::VLD1x2_UPD;
15841 NumVecs = 2;
15842 hasAlignment = false;
15843 break;
15844 case Intrinsic::arm_neon_vld1x3:
15845 NewOpc = ARMISD::VLD1x3_UPD;
15846 NumVecs = 3;
15847 hasAlignment = false;
15848 break;
15849 case Intrinsic::arm_neon_vld1x4:
15850 NewOpc = ARMISD::VLD1x4_UPD;
15851 NumVecs = 4;
15852 hasAlignment = false;
15853 break;
15854 case Intrinsic::arm_neon_vld2dup:
15855 NewOpc = ARMISD::VLD2DUP_UPD;
15856 NumVecs = 2;
15857 break;
15858 case Intrinsic::arm_neon_vld3dup:
15859 NewOpc = ARMISD::VLD3DUP_UPD;
15860 NumVecs = 3;
15861 break;
15862 case Intrinsic::arm_neon_vld4dup:
15863 NewOpc = ARMISD::VLD4DUP_UPD;
15864 NumVecs = 4;
15865 break;
15866 case Intrinsic::arm_neon_vld2lane:
15867 NewOpc = ARMISD::VLD2LN_UPD;
15868 NumVecs = 2;
15869 isLaneOp = true;
15870 break;
15871 case Intrinsic::arm_neon_vld3lane:
15872 NewOpc = ARMISD::VLD3LN_UPD;
15873 NumVecs = 3;
15874 isLaneOp = true;
15875 break;
15876 case Intrinsic::arm_neon_vld4lane:
15877 NewOpc = ARMISD::VLD4LN_UPD;
15878 NumVecs = 4;
15879 isLaneOp = true;
15880 break;
15881 case Intrinsic::arm_neon_vst1:
15882 NewOpc = ARMISD::VST1_UPD;
15883 NumVecs = 1;
15884 isLoadOp = false;
15885 break;
15886 case Intrinsic::arm_neon_vst2:
15887 NewOpc = ARMISD::VST2_UPD;
15888 NumVecs = 2;
15889 isLoadOp = false;
15890 break;
15891 case Intrinsic::arm_neon_vst3:
15892 NewOpc = ARMISD::VST3_UPD;
15893 NumVecs = 3;
15894 isLoadOp = false;
15895 break;
15896 case Intrinsic::arm_neon_vst4:
15897 NewOpc = ARMISD::VST4_UPD;
15898 NumVecs = 4;
15899 isLoadOp = false;
15900 break;
15901 case Intrinsic::arm_neon_vst2lane:
15902 NewOpc = ARMISD::VST2LN_UPD;
15903 NumVecs = 2;
15904 isLoadOp = false;
15905 isLaneOp = true;
15906 break;
15907 case Intrinsic::arm_neon_vst3lane:
15908 NewOpc = ARMISD::VST3LN_UPD;
15909 NumVecs = 3;
15910 isLoadOp = false;
15911 isLaneOp = true;
15912 break;
15913 case Intrinsic::arm_neon_vst4lane:
15914 NewOpc = ARMISD::VST4LN_UPD;
15915 NumVecs = 4;
15916 isLoadOp = false;
15917 isLaneOp = true;
15918 break;
15919 case Intrinsic::arm_neon_vst1x2:
15920 NewOpc = ARMISD::VST1x2_UPD;
15921 NumVecs = 2;
15922 isLoadOp = false;
15923 hasAlignment = false;
15924 break;
15925 case Intrinsic::arm_neon_vst1x3:
15926 NewOpc = ARMISD::VST1x3_UPD;
15927 NumVecs = 3;
15928 isLoadOp = false;
15929 hasAlignment = false;
15930 break;
15931 case Intrinsic::arm_neon_vst1x4:
15932 NewOpc = ARMISD::VST1x4_UPD;
15933 NumVecs = 4;
15934 isLoadOp = false;
15935 hasAlignment = false;
15936 break;
15937 }
15938 } else {
15939 isLaneOp = true;
15940 switch (N->getOpcode()) {
15941 default:
15942 llvm_unreachable("unexpected opcode for Neon base update");
15943 case ARMISD::VLD1DUP:
15944 NewOpc = ARMISD::VLD1DUP_UPD;
15945 NumVecs = 1;
15946 break;
15947 case ARMISD::VLD2DUP:
15948 NewOpc = ARMISD::VLD2DUP_UPD;
15949 NumVecs = 2;
15950 break;
15951 case ARMISD::VLD3DUP:
15952 NewOpc = ARMISD::VLD3DUP_UPD;
15953 NumVecs = 3;
15954 break;
15955 case ARMISD::VLD4DUP:
15956 NewOpc = ARMISD::VLD4DUP_UPD;
15957 NumVecs = 4;
15958 break;
15959 case ISD::LOAD:
15960 NewOpc = ARMISD::VLD1_UPD;
15961 NumVecs = 1;
15962 isLaneOp = false;
15963 break;
15964 case ISD::STORE:
15965 NewOpc = ARMISD::VST1_UPD;
15966 NumVecs = 1;
15967 isLaneOp = false;
15968 isLoadOp = false;
15969 break;
15970 }
15971 }
15972
15973 // Find the size of memory referenced by the load/store.
15974 EVT VecTy;
15975 if (isLoadOp) {
15976 VecTy = N->getValueType(0);
15977 } else if (Target.isIntrinsic) {
15978 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15979 } else {
15980 assert(Target.isStore &&
15981 "Node has to be a load, a store, or an intrinsic!");
15982 VecTy = N->getOperand(1).getValueType();
15983 }
15984
15985 bool isVLDDUPOp =
15986 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15987 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15988
15989 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15990 if (isLaneOp || isVLDDUPOp)
15991 NumBytes /= VecTy.getVectorNumElements();
15992
15993 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15994 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15995 // separate instructions that make it harder to use a non-constant update.
15996 return false;
15997 }
15998
15999 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16000 return false;
16001
16002 if (!isValidBaseUpdate(N, User.N))
16003 return false;
16004
16005 // OK, we found an ADD we can fold into the base update.
16006 // Now, create a _UPD node, taking care of not breaking alignment.
16007
16008 EVT AlignedVecTy = VecTy;
16009 Align Alignment = MemN->getAlign();
16010
16011 // If this is a less-than-standard-aligned load/store, change the type to
16012 // match the standard alignment.
16013 // The alignment is overlooked when selecting _UPD variants; and it's
16014 // easier to introduce bitcasts here than fix that.
16015 // There are 3 ways to get to this base-update combine:
16016 // - intrinsics: they are assumed to be properly aligned (to the standard
16017 // alignment of the memory type), so we don't need to do anything.
16018 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16019 // intrinsics, so, likewise, there's nothing to do.
16020 // - generic load/store instructions: the alignment is specified as an
16021 // explicit operand, rather than implicitly as the standard alignment
16022 // of the memory type (like the intrinsics). We need to change the
16023 // memory type to match the explicit alignment. That way, we don't
16024 // generate non-standard-aligned ARMISD::VLDx nodes.
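// As an illustrative example of the generic load/store case: a 1-byte-aligned
// store of a v2i64 value is treated below as a v16i8 store, so the value is
// bitcast to v16i8 and the resulting VST1_UPD uses an element type that
// matches the explicit alignment.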
16025 if (isa<LSBaseSDNode>(N)) {
16026 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16027 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16028 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16029 assert(!isLaneOp && "Unexpected generic load/store lane.");
16030 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16031 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16032 }
16033 // Don't set an explicit alignment on regular load/stores that we want
16034 // to transform to VLD/VST 1_UPD nodes.
16035 // This matches the behavior of regular load/stores, which only get an
16036 // explicit alignment if the MMO alignment is larger than the standard
16037 // alignment of the memory type.
16038 // Intrinsics, however, always get an explicit alignment, set to the
16039 // alignment of the MMO.
16040 Alignment = Align(1);
16041 }
16042
16043 // Create the new updating load/store node.
16044 // First, create an SDVTList for the new updating node's results.
16045 EVT Tys[6];
16046 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16047 unsigned n;
16048 for (n = 0; n < NumResultVecs; ++n)
16049 Tys[n] = AlignedVecTy;
16050 Tys[n++] = MVT::i32;
16051 Tys[n] = MVT::Other;
16052 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16053
16054 // Then, gather the new node's operands.
16055 SmallVector<SDValue, 8> Ops;
16056 Ops.push_back(N->getOperand(0)); // incoming chain
16057 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16058 Ops.push_back(User.Inc);
16059
16060 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16061 // Try to match the intrinsic's signature
16062 Ops.push_back(StN->getValue());
16063 } else {
16064 // Loads (and of course intrinsics) match the intrinsics' signature,
16065 // so just add all but the alignment operand.
16066 unsigned LastOperand =
16067 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16068 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16069 Ops.push_back(N->getOperand(i));
16070 }
16071
16072 // For all node types, the alignment operand is always the last one.
16073 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16074
16075 // If this is a non-standard-aligned STORE, the penultimate operand is the
16076 // stored value. Bitcast it to the aligned type.
16077 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16078 SDValue &StVal = Ops[Ops.size() - 2];
16079 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16080 }
16081
16082 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16083 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16084 MemN->getMemOperand());
16085
16086 // Update the uses.
16087 SmallVector<SDValue, 5> NewResults;
16088 for (unsigned i = 0; i < NumResultVecs; ++i)
16089 NewResults.push_back(SDValue(UpdN.getNode(), i));
16090
16091 // If this is a non-standard-aligned LOAD, the first result is the loaded
16092 // value. Bitcast it to the expected result type.
16093 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16094 SDValue &LdVal = NewResults[0];
16095 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16096 }
16097
16098 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16099 DCI.CombineTo(N, NewResults);
16100 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16101
16102 return true;
16103}
16104
16105 // If (opcode ptr inc) is an ADD-like instruction, return the
16106// increment value. Otherwise return 0.
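// For example (illustrative): if Ptr is known to be at least 32-byte aligned,
// (OR Ptr, 16) sets only bits that are known zero in Ptr, so it is equivalent
// to (ADD Ptr, 16) and 16 is returned as the constant increment.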
16107static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16108 SDValue Inc, const SelectionDAG &DAG) {
16109 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16110 if (!CInc)
16111 return 0;
16112
16113 switch (Opcode) {
16114 case ARMISD::VLD1_UPD:
16115 case ISD::ADD:
16116 return CInc->getZExtValue();
16117 case ISD::OR: {
16118 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16119 // (OR ptr inc) is the same as (ADD ptr inc)
16120 return CInc->getZExtValue();
16121 }
16122 return 0;
16123 }
16124 default:
16125 return 0;
16126 }
16127}
16128
16129 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16130 switch (N->getOpcode()) {
16131 case ISD::ADD:
16132 case ISD::OR: {
16133 if (isa<ConstantSDNode>(N->getOperand(1))) {
16134 *Ptr = N->getOperand(0);
16135 *CInc = N->getOperand(1);
16136 return true;
16137 }
16138 return false;
16139 }
16140 case ARMISD::VLD1_UPD: {
16141 if (isa<ConstantSDNode>(N->getOperand(2))) {
16142 *Ptr = N->getOperand(1);
16143 *CInc = N->getOperand(2);
16144 return true;
16145 }
16146 return false;
16147 }
16148 default:
16149 return false;
16150 }
16151}
16152
16153/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16154/// NEON load/store intrinsics, and generic vector load/stores, to merge
16155/// base address updates.
16156/// For generic load/stores, the memory type is assumed to be a vector.
16157/// The caller is assumed to have checked legality.
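/// Illustrative example of the effect on generated code:
///   vld1.32 {d16}, [r0]
///   add     r0, r0, #8
/// can become the post-indexed form
///   vld1.32 {d16}, [r0]!
/// provided the ADD is independent of the load and the increment matches the
/// access size.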
16158 static SDValue CombineBaseUpdate(SDNode *N,
16159 TargetLowering::DAGCombinerInfo &DCI) {
16160 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16161 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16162 const bool isStore = N->getOpcode() == ISD::STORE;
16163 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16164 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16165
16166 // Limit the number of possible base-updates we look at to prevent degenerate
16167 // cases.
16168 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16169
16170 SDValue Addr = N->getOperand(AddrOpIdx);
16171
16172 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16173
16174 // Search for a use of the address operand that is an increment.
16175 for (SDUse &Use : Addr->uses()) {
16176 SDNode *User = Use.getUser();
16177 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16178 continue;
16179
16180 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16181 unsigned ConstInc =
16182 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16183
16184 if (ConstInc || User->getOpcode() == ISD::ADD) {
16185 BaseUpdates.push_back({User, Inc, ConstInc});
16186 if (BaseUpdates.size() >= MaxBaseUpdates)
16187 break;
16188 }
16189 }
16190
16191 // If the address is a constant pointer increment itself, find
16192 // another constant increment that has the same base operand
16193 SDValue Base;
16194 SDValue CInc;
16195 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16196 unsigned Offset =
16197 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16198 for (SDUse &Use : Base->uses()) {
16199
16200 SDNode *User = Use.getUser();
16201 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16202 User->getNumOperands() != 2)
16203 continue;
16204
16205 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16206 unsigned UserOffset =
16207 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16208
16209 if (!UserOffset || UserOffset <= Offset)
16210 continue;
16211
16212 unsigned NewConstInc = UserOffset - Offset;
16213 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16214 BaseUpdates.push_back({User, NewInc, NewConstInc});
16215 if (BaseUpdates.size() >= MaxBaseUpdates)
16216 break;
16217 }
16218 }
16219
16220 // Try to fold the load/store with an update that matches memory
16221 // access size. This should work well for sequential loads.
16222 unsigned NumValidUpd = BaseUpdates.size();
16223 for (unsigned I = 0; I < NumValidUpd; I++) {
16224 BaseUpdateUser &User = BaseUpdates[I];
16225 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16226 return SDValue();
16227 }
16228
16229 // Try to fold with other users. Non-constant updates are considered
16230 // first, and constant updates are sorted to not break a sequence of
16231 // strided accesses (if there is any).
16232 llvm::stable_sort(BaseUpdates,
16233 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16234 return LHS.ConstInc < RHS.ConstInc;
16235 });
16236 for (BaseUpdateUser &User : BaseUpdates) {
16237 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16238 return SDValue();
16239 }
16240 return SDValue();
16241}
16242
16243 static SDValue PerformVLDCombine(SDNode *N,
16244 TargetLowering::DAGCombinerInfo &DCI) {
16245 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16246 return SDValue();
16247
16248 return CombineBaseUpdate(N, DCI);
16249}
16250
16251 static SDValue PerformMVEVLDCombine(SDNode *N,
16252 TargetLowering::DAGCombinerInfo &DCI) {
16253 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16254 return SDValue();
16255
16256 SelectionDAG &DAG = DCI.DAG;
16257 SDValue Addr = N->getOperand(2);
16258 MemSDNode *MemN = cast<MemSDNode>(N);
16259 SDLoc dl(N);
16260
16261 // For the stores, where there are multiple intrinsics, we only actually
16262 // want to post-inc the last of them.
16263 unsigned IntNo = N->getConstantOperandVal(1);
16264 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16265 return SDValue();
16266 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16267 return SDValue();
16268
16269 // Search for a use of the address operand that is an increment.
16270 for (SDUse &Use : Addr->uses()) {
16271 SDNode *User = Use.getUser();
16272 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16273 continue;
16274
16275 // Check that the add is independent of the load/store. Otherwise, folding
16276 // it would create a cycle. We can avoid searching through Addr as it's a
16277 // predecessor to both.
16278 SmallPtrSet<const SDNode *, 32> Visited;
16279 SmallVector<const SDNode *, 16> Worklist;
16280 Visited.insert(Addr.getNode());
16281 Worklist.push_back(N);
16282 Worklist.push_back(User);
16283 const unsigned MaxSteps = 1024;
16284 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16285 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16286 continue;
16287
16288 // Find the new opcode for the updating load/store.
16289 bool isLoadOp = true;
16290 unsigned NewOpc = 0;
16291 unsigned NumVecs = 0;
16292 switch (IntNo) {
16293 default:
16294 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16295 case Intrinsic::arm_mve_vld2q:
16296 NewOpc = ARMISD::VLD2_UPD;
16297 NumVecs = 2;
16298 break;
16299 case Intrinsic::arm_mve_vld4q:
16300 NewOpc = ARMISD::VLD4_UPD;
16301 NumVecs = 4;
16302 break;
16303 case Intrinsic::arm_mve_vst2q:
16304 NewOpc = ARMISD::VST2_UPD;
16305 NumVecs = 2;
16306 isLoadOp = false;
16307 break;
16308 case Intrinsic::arm_mve_vst4q:
16309 NewOpc = ARMISD::VST4_UPD;
16310 NumVecs = 4;
16311 isLoadOp = false;
16312 break;
16313 }
16314
16315 // Find the size of memory referenced by the load/store.
16316 EVT VecTy;
16317 if (isLoadOp) {
16318 VecTy = N->getValueType(0);
16319 } else {
16320 VecTy = N->getOperand(3).getValueType();
16321 }
16322
16323 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16324
16325 // If the increment is a constant, it must match the memory ref size.
16326 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16327 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16328 if (!CInc || CInc->getZExtValue() != NumBytes)
16329 continue;
16330
16331 // Create the new updating load/store node.
16332 // First, create an SDVTList for the new updating node's results.
16333 EVT Tys[6];
16334 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16335 unsigned n;
16336 for (n = 0; n < NumResultVecs; ++n)
16337 Tys[n] = VecTy;
16338 Tys[n++] = MVT::i32;
16339 Tys[n] = MVT::Other;
16340 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16341
16342 // Then, gather the new node's operands.
16343 SmallVector<SDValue, 8> Ops;
16344 Ops.push_back(N->getOperand(0)); // incoming chain
16345 Ops.push_back(N->getOperand(2)); // ptr
16346 Ops.push_back(Inc);
16347
16348 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16349 Ops.push_back(N->getOperand(i));
16350
16351 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16352 MemN->getMemOperand());
16353
16354 // Update the uses.
16355 SmallVector<SDValue, 5> NewResults;
16356 for (unsigned i = 0; i < NumResultVecs; ++i)
16357 NewResults.push_back(SDValue(UpdN.getNode(), i));
16358
16359 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16360 DCI.CombineTo(N, NewResults);
16361 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16362
16363 break;
16364 }
16365
16366 return SDValue();
16367}
16368
16369/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16370/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16371/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16372/// return true.
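/// Illustrative example: if the only users of
///   vld2.32 {d16[1], d17[1]}, [r0]
/// are VDUPLANEs of lane 1, the lane load plus the duplicates can become
///   vld2.32 {d16[], d17[]}, [r0]
/// which loads one element per register and replicates it across all lanes.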
16373 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16374 SelectionDAG &DAG = DCI.DAG;
16375 EVT VT = N->getValueType(0);
16376 // vldN-dup instructions only support 64-bit vectors for N > 1.
16377 if (!VT.is64BitVector())
16378 return false;
16379
16380 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16381 SDNode *VLD = N->getOperand(0).getNode();
16382 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16383 return false;
16384 unsigned NumVecs = 0;
16385 unsigned NewOpc = 0;
16386 unsigned IntNo = VLD->getConstantOperandVal(1);
16387 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16388 NumVecs = 2;
16389 NewOpc = ARMISD::VLD2DUP;
16390 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16391 NumVecs = 3;
16392 NewOpc = ARMISD::VLD3DUP;
16393 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16394 NumVecs = 4;
16395 NewOpc = ARMISD::VLD4DUP;
16396 } else {
16397 return false;
16398 }
16399
16400 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16401 // numbers match the load.
16402 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16403 for (SDUse &Use : VLD->uses()) {
16404 // Ignore uses of the chain result.
16405 if (Use.getResNo() == NumVecs)
16406 continue;
16407 SDNode *User = Use.getUser();
16408 if (User->getOpcode() != ARMISD::VDUPLANE ||
16409 VLDLaneNo != User->getConstantOperandVal(1))
16410 return false;
16411 }
16412
16413 // Create the vldN-dup node.
16414 EVT Tys[5];
16415 unsigned n;
16416 for (n = 0; n < NumVecs; ++n)
16417 Tys[n] = VT;
16418 Tys[n] = MVT::Other;
16419 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16420 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16421 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16422 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16423 Ops, VLDMemInt->getMemoryVT(),
16424 VLDMemInt->getMemOperand());
16425
16426 // Update the uses.
16427 for (SDUse &Use : VLD->uses()) {
16428 unsigned ResNo = Use.getResNo();
16429 // Ignore uses of the chain result.
16430 if (ResNo == NumVecs)
16431 continue;
16432 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16433 }
16434
16435 // Now the vldN-lane intrinsic is dead except for its chain result.
16436 // Update uses of the chain.
16437 std::vector<SDValue> VLDDupResults;
16438 for (unsigned n = 0; n < NumVecs; ++n)
16439 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16440 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16441 DCI.CombineTo(VLD, VLDDupResults);
16442
16443 return true;
16444}
16445
16446/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16447/// ARMISD::VDUPLANE.
16448 static SDValue PerformVDUPLANECombine(SDNode *N,
16449 TargetLowering::DAGCombinerInfo &DCI,
16450 const ARMSubtarget *Subtarget) {
16451 SDValue Op = N->getOperand(0);
16452 EVT VT = N->getValueType(0);
16453
16454 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16455 if (Subtarget->hasMVEIntegerOps()) {
16456 EVT ExtractVT = VT.getVectorElementType();
16457 // We need to ensure we are creating a legal type.
16458 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16459 ExtractVT = MVT::i32;
16460 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16461 N->getOperand(0), N->getOperand(1));
16462 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16463 }
16464
16465 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16466 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16467 if (CombineVLDDUP(N, DCI))
16468 return SDValue(N, 0);
16469
16470 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16471 // redundant. Ignore bit_converts for now; element sizes are checked below.
16472 while (Op.getOpcode() == ISD::BITCAST)
16473 Op = Op.getOperand(0);
16474 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16475 return SDValue();
16476
16477 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16478 unsigned EltSize = Op.getScalarValueSizeInBits();
16479 // The canonical VMOV for a zero vector uses a 32-bit element size.
16480 unsigned Imm = Op.getConstantOperandVal(0);
16481 unsigned EltBits;
16482 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16483 EltSize = 8;
16484 if (EltSize > VT.getScalarSizeInBits())
16485 return SDValue();
16486
16487 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16488}
16489
16490/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16491 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16492 const ARMSubtarget *Subtarget) {
16493 SDValue Op = N->getOperand(0);
16494 SDLoc dl(N);
16495
16496 if (Subtarget->hasMVEIntegerOps()) {
16497 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16498 // need to come from a GPR.
16499 if (Op.getValueType() == MVT::f32)
16500 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16501 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16502 else if (Op.getValueType() == MVT::f16)
16503 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16504 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16505 }
16506
16507 if (!Subtarget->hasNEON())
16508 return SDValue();
16509
16510 // Match VDUP(LOAD) -> VLD1DUP.
16511 // We match this pattern here rather than waiting for isel because the
16512 // transform is only legal for unindexed loads.
16513 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16514 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16515 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16516 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16517 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16518 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16519 SDValue VLDDup =
16520 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16521 LD->getMemoryVT(), LD->getMemOperand());
16522 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16523 return VLDDup;
16524 }
16525
16526 return SDValue();
16527}
16528
16529 static SDValue PerformLOADCombine(SDNode *N,
16530 TargetLowering::DAGCombinerInfo &DCI,
16531 const ARMSubtarget *Subtarget) {
16532 EVT VT = N->getValueType(0);
16533
16534 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16535 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16536 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16537 return CombineBaseUpdate(N, DCI);
16538
16539 return SDValue();
16540}
16541
16542// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16543// pack all of the elements in one place. Next, store to memory in fewer
16544// chunks.
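// Illustrative example: a truncating store of v4i32 to v4i8 becomes a byte
// shuffle that packs the four low bytes into the bottom of the register,
// followed by a single 32-bit store, rather than four separate byte stores.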
16545 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16546 SelectionDAG &DAG) {
16547 SDValue StVal = St->getValue();
16548 EVT VT = StVal.getValueType();
16549 if (!St->isTruncatingStore() || !VT.isVector())
16550 return SDValue();
16551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16552 EVT StVT = St->getMemoryVT();
16553 unsigned NumElems = VT.getVectorNumElements();
16554 assert(StVT != VT && "Cannot truncate to the same type");
16555 unsigned FromEltSz = VT.getScalarSizeInBits();
16556 unsigned ToEltSz = StVT.getScalarSizeInBits();
16557
16558 // The From and To sizes and the element count must be powers of two.
16559 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16560 return SDValue();
16561
16562 // We are going to use the original vector elt for storing.
16563 // Accumulated smaller vector elements must be a multiple of the store size.
16564 if (0 != (NumElems * FromEltSz) % ToEltSz)
16565 return SDValue();
16566
16567 unsigned SizeRatio = FromEltSz / ToEltSz;
16568 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16569
16570 // Create a type on which we perform the shuffle.
16571 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16572 NumElems * SizeRatio);
16573 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16574
16575 SDLoc DL(St);
16576 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16577 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16578 for (unsigned i = 0; i < NumElems; ++i)
16579 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16580 : i * SizeRatio;
16581
16582 // Can't shuffle using an illegal type.
16583 if (!TLI.isTypeLegal(WideVecVT))
16584 return SDValue();
16585
16586 SDValue Shuff = DAG.getVectorShuffle(
16587 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16588 // At this point all of the data is stored at the bottom of the
16589 // register. We now need to save it to mem.
16590
16591 // Find the largest store unit
16592 MVT StoreType = MVT::i8;
16593 for (MVT Tp : MVT::integer_valuetypes()) {
16594 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16595 StoreType = Tp;
16596 }
16597 // Didn't find a legal store type.
16598 if (!TLI.isTypeLegal(StoreType))
16599 return SDValue();
16600
16601 // Bitcast the original vector into a vector of store-size units
16602 EVT StoreVecVT =
16603 EVT::getVectorVT(*DAG.getContext(), StoreType,
16604 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16605 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16606 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16607 SmallVector<SDValue, 8> Chains;
16608 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16609 TLI.getPointerTy(DAG.getDataLayout()));
16610 SDValue BasePtr = St->getBasePtr();
16611
16612 // Perform one or more big stores into memory.
16613 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16614 for (unsigned I = 0; I < E; I++) {
16615 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16616 ShuffWide, DAG.getIntPtrConstant(I, DL));
16617 SDValue Ch =
16618 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16619 St->getAlign(), St->getMemOperand()->getFlags());
16620 BasePtr =
16621 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16622 Chains.push_back(Ch);
16623 }
16624 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16625}
16626
16627// Try taking a single vector store from an fpround (which would otherwise turn
16628// into an expensive buildvector) and splitting it into a series of narrowing
16629// stores.
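// Illustrative example: a store of (fp_round v8f32 to v8f16) is split into two
// halves; each v4f32 half is narrowed with VCVTN and stored with a 64-bit
// truncating store at the matching offset.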
16630 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16631 SelectionDAG &DAG) {
16632 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16633 return SDValue();
16634 SDValue Trunc = St->getValue();
16635 if (Trunc->getOpcode() != ISD::FP_ROUND)
16636 return SDValue();
16637 EVT FromVT = Trunc->getOperand(0).getValueType();
16638 EVT ToVT = Trunc.getValueType();
16639 if (!ToVT.isVector())
16640 return SDValue();
16641 assert(FromVT.isVector());
16642 EVT ToEltVT = ToVT.getVectorElementType();
16643 EVT FromEltVT = FromVT.getVectorElementType();
16644
16645 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16646 return SDValue();
16647
16648 unsigned NumElements = 4;
16649 if (FromVT.getVectorNumElements() % NumElements != 0)
16650 return SDValue();
16651
16652 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16653 // use the VMOVN over splitting the store. We are looking for patterns of:
16654 // !rev: 0 N 1 N+1 2 N+2 ...
16655 // rev: N 0 N+1 1 N+2 2 ...
16656 // The shuffle may either be a single source (in which case N = NumElts/2) or
16657 // two inputs extended with concat to the same size (in which case N =
16658 // NumElts).
16659 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16660 ArrayRef<int> M = SVN->getMask();
16661 unsigned NumElts = ToVT.getVectorNumElements();
16662 if (SVN->getOperand(1).isUndef())
16663 NumElts /= 2;
16664
16665 unsigned Off0 = Rev ? NumElts : 0;
16666 unsigned Off1 = Rev ? 0 : NumElts;
16667
16668 for (unsigned I = 0; I < NumElts; I += 2) {
16669 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16670 return false;
16671 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16672 return false;
16673 }
16674
16675 return true;
16676 };
16677
16678 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16679 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16680 return SDValue();
16681
16682 LLVMContext &C = *DAG.getContext();
16683 SDLoc DL(St);
16684 // Details about the old store
16685 SDValue Ch = St->getChain();
16686 SDValue BasePtr = St->getBasePtr();
16687 Align Alignment = St->getBaseAlign();
16688 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16689 AAMDNodes AAInfo = St->getAAInfo();
16690
16691 // We split the store into slices of NumElements. fp16 trunc stores are
16692 // converted with VCVT and then emitted as truncating integer stores.
16693 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16694 EVT NewToVT = EVT::getVectorVT(
16695 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16696
16697 SmallVector<SDValue, 4> Stores;
16698 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16699 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16700 SDValue NewPtr =
16701 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16702
16703 SDValue Extract =
16704 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16705 DAG.getConstant(i * NumElements, DL, MVT::i32));
16706
16707 SDValue FPTrunc =
16708 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16709 Extract, DAG.getConstant(0, DL, MVT::i32));
16710 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16711
16712 SDValue Store = DAG.getTruncStore(
16713 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16714 NewToVT, Alignment, MMOFlags, AAInfo);
16715 Stores.push_back(Store);
16716 }
16717 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16718}
16719
16720// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16721// into an expensive buildvector) and splitting it into a series of narrowing
16722// stores.
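// Illustrative example: a v8i16 store of (MVETRUNC x:v4i32, y:v4i32) becomes
// two v4i32->v4i16 truncating stores of x and y at offsets 0 and 8 bytes.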
16723 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16724 SelectionDAG &DAG) {
16725 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16726 return SDValue();
16727 SDValue Trunc = St->getValue();
16728 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16729 return SDValue();
16730 EVT FromVT = Trunc->getOperand(0).getValueType();
16731 EVT ToVT = Trunc.getValueType();
16732
16733 LLVMContext &C = *DAG.getContext();
16734 SDLoc DL(St);
16735 // Details about the old store
16736 SDValue Ch = St->getChain();
16737 SDValue BasePtr = St->getBasePtr();
16738 Align Alignment = St->getBaseAlign();
16739 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16740 AAMDNodes AAInfo = St->getAAInfo();
16741
16742 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16743 FromVT.getVectorNumElements());
16744
16745 SmallVector<SDValue, 4> Stores;
16746 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16747 unsigned NewOffset =
16748 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16749 SDValue NewPtr =
16750 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16751
16752 SDValue Extract = Trunc.getOperand(i);
16753 SDValue Store = DAG.getTruncStore(
16754 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16755 NewToVT, Alignment, MMOFlags, AAInfo);
16756 Stores.push_back(Store);
16757 }
16758 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16759}
16760
16761// Given a floating point store from an extracted vector, with an integer
16762// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16763// help reduce fp register pressure, doesn't require the fp extract and allows
16764// use of more integer post-inc stores not available with vstr.
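// Illustrative example: if (VGETLANEu q0, 1) already exists and we see a store
// of the f16 (extract_vector_elt q0, 1), the f16 store can be replaced by a
// 16-bit truncating integer store of the VGETLANEu result, avoiding the fp
// extract entirely.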
16765 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16766 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16767 return SDValue();
16768 SDValue Extract = St->getValue();
16769 EVT VT = Extract.getValueType();
16770 // For now only uses f16. This may be useful for f32 too, but that will
16771 // be bitcast(extract), not the VGETLANEu we currently check here.
16772 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16773 return SDValue();
16774
16775 SDNode *GetLane =
16776 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16777 {Extract.getOperand(0), Extract.getOperand(1)});
16778 if (!GetLane)
16779 return SDValue();
16780
16781 LLVMContext &C = *DAG.getContext();
16782 SDLoc DL(St);
16783 // Create a new integer store to replace the existing floating point version.
16784 SDValue Ch = St->getChain();
16785 SDValue BasePtr = St->getBasePtr();
16786 Align Alignment = St->getBaseAlign();
16787 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16788 AAMDNodes AAInfo = St->getAAInfo();
16789 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16790 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16791 St->getPointerInfo(), NewToVT, Alignment,
16792 MMOFlags, AAInfo);
16793
16794 return Store;
16795}
16796
16797/// PerformSTORECombine - Target-specific dag combine xforms for
16798/// ISD::STORE.
16799 static SDValue PerformSTORECombine(SDNode *N,
16800 TargetLowering::DAGCombinerInfo &DCI,
16801 const ARMSubtarget *Subtarget) {
16802 StoreSDNode *St = cast<StoreSDNode>(N);
16803 if (St->isVolatile())
16804 return SDValue();
16805 SDValue StVal = St->getValue();
16806 EVT VT = StVal.getValueType();
16807
16808 if (Subtarget->hasNEON())
16809 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16810 return Store;
16811
16812 if (Subtarget->hasMVEFloatOps())
16813 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16814 return NewToken;
16815
16816 if (Subtarget->hasMVEIntegerOps()) {
16817 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16818 return NewChain;
16819 if (SDValue NewToken =
16820 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16821 return NewToken;
16822 }
16823
16824 if (!ISD::isNormalStore(St))
16825 return SDValue();
16826
16827 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16828 // ARM stores of arguments in the same cache line.
16829 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16830 StVal.getNode()->hasOneUse()) {
16831 SelectionDAG &DAG = DCI.DAG;
16832 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16833 SDLoc DL(St);
16834 SDValue BasePtr = St->getBasePtr();
16835 SDValue NewST1 = DAG.getStore(
16836 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16837 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16838 St->getMemOperand()->getFlags());
16839
16840 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16841 DAG.getConstant(4, DL, MVT::i32));
16842 return DAG.getStore(NewST1.getValue(0), DL,
16843 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16844 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16845 St->getBaseAlign(), St->getMemOperand()->getFlags());
16846 }
16847
16848 if (StVal.getValueType() == MVT::i64 &&
16849 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16850
16851 // Bitcast an i64 store extracted from a vector to f64.
16852 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16853 SelectionDAG &DAG = DCI.DAG;
16854 SDLoc dl(StVal);
16855 SDValue IntVec = StVal.getOperand(0);
16856 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16857 IntVec.getValueType().getVectorNumElements());
16858 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16859 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16860 Vec, StVal.getOperand(1));
16861 dl = SDLoc(N);
16862 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16863 // Make the DAGCombiner fold the bitcasts.
16864 DCI.AddToWorklist(Vec.getNode());
16865 DCI.AddToWorklist(ExtElt.getNode());
16866 DCI.AddToWorklist(V.getNode());
16867 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16868 St->getPointerInfo(), St->getAlign(),
16869 St->getMemOperand()->getFlags(), St->getAAInfo());
16870 }
16871
16872 // If this is a legal vector store, try to combine it into a VST1_UPD.
16873 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16874 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16875 return CombineBaseUpdate(N, DCI);
16876
16877 return SDValue();
16878}
16879
16880/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16881/// can replace combinations of VMUL and VCVT (floating-point to integer)
16882/// when the VMUL has a constant operand that is a power of 2.
16883///
16884/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16885/// vmul.f32 d16, d17, d16
16886/// vcvt.s32.f32 d16, d16
16887/// becomes:
16888/// vcvt.s32.f32 d16, d16, #3
16889 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16890 const ARMSubtarget *Subtarget) {
16891 if (!Subtarget->hasNEON())
16892 return SDValue();
16893
16894 SDValue Op = N->getOperand(0);
16895 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16896 Op.getOpcode() != ISD::FMUL)
16897 return SDValue();
16898
16899 SDValue ConstVec = Op->getOperand(1);
16900 if (!isa<BuildVectorSDNode>(ConstVec))
16901 return SDValue();
16902
16903 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16904 uint32_t FloatBits = FloatTy.getSizeInBits();
16905 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16906 uint32_t IntBits = IntTy.getSizeInBits();
16907 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16908 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16909 // These instructions only exist converting from f32 to i32. We can handle
16910 // smaller integers by generating an extra truncate, but larger ones would
16911 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16912 // these instructions only support v2i32/v4i32 types.
16913 return SDValue();
16914 }
16915
16916 BitVector UndefElements;
16917 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16918 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16919 if (C == -1 || C == 0 || C > 32)
16920 return SDValue();
16921
16922 SDLoc dl(N);
16923 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16924 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16925 Intrinsic::arm_neon_vcvtfp2fxu;
16926 SDValue FixConv = DAG.getNode(
16927 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16928 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16929 DAG.getConstant(C, dl, MVT::i32));
16930
16931 if (IntBits < FloatBits)
16932 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16933
16934 return FixConv;
16935}
16936
16937 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16938 const ARMSubtarget *Subtarget) {
16939 if (!Subtarget->hasMVEFloatOps())
16940 return SDValue();
16941
16942 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16943 // The second form can be more easily turned into a predicated vadd, and
16944 // possibly combined into a fma to become a predicated vfma.
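// Illustrative reasoning: fadd(x, -0.0) == x (and fadd(x, +0.0) == x with nsz),
// so fadd(x, vselect(c, y, -0.0)) == vselect(c, fadd(x, y), fadd(x, -0.0))
//                                 == vselect(c, fadd(x, y), x).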
16945 SDValue Op0 = N->getOperand(0);
16946 SDValue Op1 = N->getOperand(1);
16947 EVT VT = N->getValueType(0);
16948 SDLoc DL(N);
16949
16950 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16951 // which these VMOV's represent.
16952 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16953 if (Op.getOpcode() != ISD::BITCAST ||
16954 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16955 return false;
16956 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16957 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16958 return true;
16959 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16960 return true;
16961 return false;
16962 };
16963
16964 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16965 std::swap(Op0, Op1);
16966
16967 if (Op1.getOpcode() != ISD::VSELECT)
16968 return SDValue();
16969
16970 SDNodeFlags FaddFlags = N->getFlags();
16971 bool NSZ = FaddFlags.hasNoSignedZeros();
16972 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16973 return SDValue();
16974
16975 SDValue FAdd =
16976 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16977 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16978}
16979
16980 static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16981 SDValue LHS = N->getOperand(0);
16982 SDValue RHS = N->getOperand(1);
16983 EVT VT = N->getValueType(0);
16984 SDLoc DL(N);
16985
16986 if (!N->getFlags().hasAllowReassociation())
16987 return SDValue();
16988
16989 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16990 auto ReassocComplex = [&](SDValue A, SDValue B) {
16991 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16992 return SDValue();
16993 unsigned Opc = A.getConstantOperandVal(0);
16994 if (Opc != Intrinsic::arm_mve_vcmlaq)
16995 return SDValue();
16996 SDValue VCMLA = DAG.getNode(
16997 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16998 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16999 A.getOperand(3), A.getOperand(4));
17000 VCMLA->setFlags(A->getFlags());
17001 return VCMLA;
17002 };
17003 if (SDValue R = ReassocComplex(LHS, RHS))
17004 return R;
17005 if (SDValue R = ReassocComplex(RHS, LHS))
17006 return R;
17007
17008 return SDValue();
17009}
17010
17011 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17012 const ARMSubtarget *Subtarget) {
17013 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17014 return S;
17015 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17016 return S;
17017 return SDValue();
17018}
17019
17020/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17021/// can replace combinations of VCVT (integer to floating-point) and VMUL
17022/// when the VMUL has a constant operand that is a power of 2.
17023///
17024/// Example (assume d17 = <float 0.125, float 0.125>):
17025/// vcvt.f32.s32 d16, d16
17026/// vmul.f32 d16, d16, d17
17027/// becomes:
17028/// vcvt.f32.s32 d16, d16, #3
17029 static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17030 const ARMSubtarget *Subtarget) {
17031 if (!Subtarget->hasNEON())
17032 return SDValue();
17033
17034 SDValue Op = N->getOperand(0);
17035 unsigned OpOpcode = Op.getNode()->getOpcode();
17036 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17037 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17038 return SDValue();
17039
17040 SDValue ConstVec = N->getOperand(1);
17041 if (!isa<BuildVectorSDNode>(ConstVec))
17042 return SDValue();
17043
17044 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17045 uint32_t FloatBits = FloatTy.getSizeInBits();
17046 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17047 uint32_t IntBits = IntTy.getSizeInBits();
17048 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17049 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17050 // These instructions only exist converting from i32 to f32. We can handle
17051 // smaller integers by generating an extra extend, but larger ones would
17052 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17053 // these instructions only support v2i32/v4i32 types.
17054 return SDValue();
17055 }
17056
17057 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17058 APFloat Recip(0.0f);
17059 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17060 return SDValue();
17061
17062 bool IsExact;
17063 APSInt IntVal(33);
17064 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17065 APFloat::opOK ||
17066 !IsExact)
17067 return SDValue();
17068
17069 int32_t C = IntVal.exactLogBase2();
17070 if (C == -1 || C == 0 || C > 32)
17071 return SDValue();
17072
17073 SDLoc DL(N);
17074 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17075 SDValue ConvInput = Op.getOperand(0);
17076 if (IntBits < FloatBits)
17077 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17078 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17079
17080 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17081 : Intrinsic::arm_neon_vcvtfxu2fp;
17082 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17083 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17084 DAG.getConstant(C, DL, MVT::i32));
17085}
17086
17087 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17088 const ARMSubtarget *ST) {
17089 if (!ST->hasMVEIntegerOps())
17090 return SDValue();
17091
17092 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17093 EVT ResVT = N->getValueType(0);
17094 SDValue N0 = N->getOperand(0);
17095 SDLoc dl(N);
17096
17097 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17098 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17099 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17100 N0.getValueType() == MVT::v16i8)) {
17101 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17102 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17103 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17104 }
17105
17106 // We are looking for something that will have illegal types if left alone,
17107 // but that we can convert to a single instruction under MVE. For example
17108 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17109 // or
17110 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17111
17112 // The legal cases are:
17113 // VADDV u/s 8/16/32
17114 // VMLAV u/s 8/16/32
17115 // VADDLV u/s 32
17116 // VMLALV u/s 16/32
17117
17118 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17119 // extend it and use v4i32 instead.
17120 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17121 EVT AVT = A.getValueType();
17122 return any_of(ExtTypes, [&](MVT Ty) {
17123 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17124 AVT.bitsLE(Ty);
17125 });
17126 };
17127 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17128 EVT AVT = A.getValueType();
17129 if (!AVT.is128BitVector())
17130 A = DAG.getNode(ExtendCode, dl,
17131 AVT.changeVectorElementType(MVT::getIntegerVT(
17132 128 / AVT.getVectorMinNumElements())),
17133 A);
17134 return A;
17135 };
17136 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17137 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17138 return SDValue();
17139 SDValue A = N0->getOperand(0);
17140 if (ExtTypeMatches(A, ExtTypes))
17141 return ExtendIfNeeded(A, ExtendCode);
17142 return SDValue();
17143 };
17144 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17145 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17146 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17147 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17148 return SDValue();
17149 Mask = N0->getOperand(0);
17150 SDValue Ext = N0->getOperand(1);
17151 if (Ext->getOpcode() != ExtendCode)
17152 return SDValue();
17153 SDValue A = Ext->getOperand(0);
17154 if (ExtTypeMatches(A, ExtTypes))
17155 return ExtendIfNeeded(A, ExtendCode);
17156 return SDValue();
17157 };
17158 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17159 SDValue &A, SDValue &B) {
17160 // For a vmla we are trying to match a larger pattern:
17161 // ExtA = sext/zext A
17162 // ExtB = sext/zext B
17163 // Mul = mul ExtA, ExtB
17164 // vecreduce.add Mul
17165 // There might also be an extra extend between the mul and the addreduce, so
17166 // long as the bitwidth is high enough to make them equivalent (for example
17167 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17168 if (ResVT != RetTy)
17169 return false;
17170 SDValue Mul = N0;
17171 if (Mul->getOpcode() == ExtendCode &&
17172 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17173 ResVT.getScalarSizeInBits())
17174 Mul = Mul->getOperand(0);
17175 if (Mul->getOpcode() != ISD::MUL)
17176 return false;
17177 SDValue ExtA = Mul->getOperand(0);
17178 SDValue ExtB = Mul->getOperand(1);
17179 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17180 return false;
17181 A = ExtA->getOperand(0);
17182 B = ExtB->getOperand(0);
17183 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17184 A = ExtendIfNeeded(A, ExtendCode);
17185 B = ExtendIfNeeded(B, ExtendCode);
17186 return true;
17187 }
17188 return false;
17189 };
17190 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17191 SDValue &A, SDValue &B, SDValue &Mask) {
17192 // Same as the pattern above with a select for the zero predicated lanes
17193 // ExtA = sext/zext A
17194 // ExtB = sext/zext B
17195 // Mul = mul ExtA, ExtB
17196 // N0 = select Mask, Mul, 0
17197 // vecreduce.add N0
17198 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17199 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17200 return false;
17201 Mask = N0->getOperand(0);
17202 SDValue Mul = N0->getOperand(1);
17203 if (Mul->getOpcode() == ExtendCode &&
17204 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17205 ResVT.getScalarSizeInBits())
17206 Mul = Mul->getOperand(0);
17207 if (Mul->getOpcode() != ISD::MUL)
17208 return false;
17209 SDValue ExtA = Mul->getOperand(0);
17210 SDValue ExtB = Mul->getOperand(1);
17211 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17212 return false;
17213 A = ExtA->getOperand(0);
17214 B = ExtB->getOperand(0);
17215 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17216 A = ExtendIfNeeded(A, ExtendCode);
17217 B = ExtendIfNeeded(B, ExtendCode);
17218 return true;
17219 }
17220 return false;
17221 };
17222 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17223 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17224 // reductions. The operands are extended with MVEEXT, but as they are
17225 // reductions the lane orders do not matter. MVEEXT may be combined with
17226 // loads to produce two extending loads, or else they will be expanded to
17227 // VREV/VMOVL.
17228 EVT VT = Ops[0].getValueType();
17229 if (VT == MVT::v16i8) {
17230 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17231 "Unexpected illegal long reduction opcode");
17232 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17233
17234 SDValue Ext0 =
17235 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17236 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17237 SDValue Ext1 =
17238 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17239 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17240
17241 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17242 Ext0, Ext1);
17243 SDValue MLA1 =
17244 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17245 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17246 Ext0.getValue(1), Ext1.getValue(1));
17247 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17248 }
17249 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17250 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17251 SDValue(Node.getNode(), 1));
17252 };
17253
17254 SDValue A, B;
17255 SDValue Mask;
17256 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17257 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17258 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17259 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17260 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17261 A, B))
17262 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17263 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17264 A, B))
17265 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17266 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17267 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17268 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17269 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17270 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17271 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17272
17273 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17274 Mask))
17275 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17276 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17277 Mask))
17278 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17279 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17280 Mask))
17281 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17282 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17283 Mask))
17284 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17285 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17286 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17287 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17288 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17289 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17290 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17291
17292 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17293 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17294 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17295 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17296 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17297 return Create64bitNode(ARMISD::VADDLVs, {A});
17298 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17299 return Create64bitNode(ARMISD::VADDLVu, {A});
17300 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17301 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17302 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17303 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17304 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17305 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17306
17307 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17308 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17309 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17310 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17311 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17312 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17313 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17314 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17315 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17316 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17317 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17318 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17319 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17320 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17321
17322 // Some complications. We can get a case where the two inputs of the mul are
17323 // the same, in which case the output sext will have been helpfully converted to a
17324 // zext. Turn it back.
17325 SDValue Op = N0;
17326 if (Op->getOpcode() == ISD::VSELECT)
17327 Op = Op->getOperand(1);
17328 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17329 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17330 SDValue Mul = Op->getOperand(0);
17331 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17332 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17333 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17334 if (Op != N0)
17335 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17336 N0->getOperand(0), Ext, N0->getOperand(2));
17337 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17338 }
17339 }
17340
17341 return SDValue();
17342}
17343
17344// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17345// the lanes are used. Due to the reduction being commutative the shuffle can be
17346// removed.
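// Illustrative example: vaddv(shuffle x, <3,2,1,0>) adds up exactly the same
// lanes as vaddv(x), so the shuffle is dropped and x is used directly.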
17347 static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17348 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17349 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17350 if (!Shuf || !Shuf->getOperand(1).isUndef())
17351 return SDValue();
17352
17353 // Check all elements are used once in the mask.
17354 ArrayRef<int> Mask = Shuf->getMask();
17355 APInt SetElts(Mask.size(), 0);
17356 for (int E : Mask) {
17357 if (E < 0 || E >= (int)Mask.size())
17358 return SDValue();
17359 SetElts.setBit(E);
17360 }
17361 if (!SetElts.isAllOnes())
17362 return SDValue();
17363
17364 if (N->getNumOperands() != VecOp + 1) {
17365 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17366 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17367 return SDValue();
17368 }
17369
17370 SmallVector<SDValue> Ops;
17371 for (SDValue Op : N->ops()) {
17372 if (Op.getValueType().isVector())
17373 Ops.push_back(Op.getOperand(0));
17374 else
17375 Ops.push_back(Op);
17376 }
17377 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17378}
17379
17380 static SDValue PerformVMOVNCombine(SDNode *N,
17381 TargetLowering::DAGCombinerInfo &DCI) {
17382 SDValue Op0 = N->getOperand(0);
17383 SDValue Op1 = N->getOperand(1);
17384 unsigned IsTop = N->getConstantOperandVal(2);
17385
17386 // VMOVNT a undef -> a
17387 // VMOVNB a undef -> a
17388 // VMOVNB undef a -> a
17389 if (Op1->isUndef())
17390 return Op0;
17391 if (Op0->isUndef() && !IsTop)
17392 return Op1;
17393
17394 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17395 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17396 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17397 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17398 Op1->getConstantOperandVal(2) == 0)
17399 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17400 Op0, Op1->getOperand(1), N->getOperand(2));
17401
17402 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17403 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17404 // into the top or bottom lanes.
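// For instance (illustrative), with a v8i16 VMOVNT (IsTop == 1) only lanes
// {0,2,4,6} of both Op0 and Op1 are demanded; with a VMOVNB, lanes {1,3,5,7}
// of Op0 and {0,2,4,6} of Op1 are demanded.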
17405 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17406 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17407 APInt Op0DemandedElts =
17408 IsTop ? Op1DemandedElts
17409 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17410
17411 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17412 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17413 return SDValue(N, 0);
17414 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17415 return SDValue(N, 0);
17416
17417 return SDValue();
17418}
17419
17422 SDValue Op0 = N->getOperand(0);
17423 unsigned IsTop = N->getConstantOperandVal(2);
17424
17425 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17426 APInt Op0DemandedElts =
17427 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17428 : APInt::getHighBitsSet(2, 1));
17429
17430 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17431 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17432 return SDValue(N, 0);
17433 return SDValue();
17434}
17435
17438 EVT VT = N->getValueType(0);
17439 SDValue LHS = N->getOperand(0);
17440 SDValue RHS = N->getOperand(1);
17441
17442 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17443 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17444 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17445 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17446 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17447 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17448 SDLoc DL(N);
17449 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17450 LHS.getOperand(0), RHS.getOperand(0));
17451 SDValue UndefV = LHS.getOperand(1);
17452 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17453 }
17454 return SDValue();
17455}
17456
17458 SDLoc DL(N);
17459 SDValue Op0 = N->getOperand(0);
17460 SDValue Op1 = N->getOperand(1);
17461
17462 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17463 // uses of the intrinsics.
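// For example (illustrative): an LSLL of {lo, hi} by -3 is rewritten as an
// LSRL by 3, and an LSRL or ASRL by a negative amount becomes an LSLL of the
// corresponding positive amount.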
17464 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17465 int ShiftAmt = C->getSExtValue();
17466 if (ShiftAmt == 0) {
17467 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17468 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17469 return SDValue();
17470 }
17471
17472 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17473 unsigned NewOpcode =
17474 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17475 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17476 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17477 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17478 return NewShift;
17479 }
17480 }
17481
17482 return SDValue();
17483}
17484
17485/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17487 DAGCombinerInfo &DCI) const {
17488 SelectionDAG &DAG = DCI.DAG;
17489 unsigned IntNo = N->getConstantOperandVal(0);
17490 switch (IntNo) {
17491 default:
17492 // Don't do anything for most intrinsics.
17493 break;
17494
17495 // Vector shifts: check for immediate versions and lower them.
17496 // Note: This is done during DAG combining instead of DAG legalizing because
17497 // the build_vectors for 64-bit vector element shift counts are generally
17498 // not legal, and it is hard to see their values after they get legalized to
17499 // loads from a constant pool.
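// For example (illustrative): arm.neon.vshifts(x, splat(3)) becomes a
// VSHLIMM by 3, while arm.neon.vshifts(x, splat(-3)) becomes a VSHRsIMM
// by 3.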
17500 case Intrinsic::arm_neon_vshifts:
17501 case Intrinsic::arm_neon_vshiftu:
17502 case Intrinsic::arm_neon_vrshifts:
17503 case Intrinsic::arm_neon_vrshiftu:
17504 case Intrinsic::arm_neon_vrshiftn:
17505 case Intrinsic::arm_neon_vqshifts:
17506 case Intrinsic::arm_neon_vqshiftu:
17507 case Intrinsic::arm_neon_vqshiftsu:
17508 case Intrinsic::arm_neon_vqshiftns:
17509 case Intrinsic::arm_neon_vqshiftnu:
17510 case Intrinsic::arm_neon_vqshiftnsu:
17511 case Intrinsic::arm_neon_vqrshiftns:
17512 case Intrinsic::arm_neon_vqrshiftnu:
17513 case Intrinsic::arm_neon_vqrshiftnsu: {
17514 EVT VT = N->getOperand(1).getValueType();
17515 int64_t Cnt;
17516 unsigned VShiftOpc = 0;
17517
17518 switch (IntNo) {
17519 case Intrinsic::arm_neon_vshifts:
17520 case Intrinsic::arm_neon_vshiftu:
17521 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17522 VShiftOpc = ARMISD::VSHLIMM;
17523 break;
17524 }
17525 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17526 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17527 : ARMISD::VSHRuIMM);
17528 break;
17529 }
17530 return SDValue();
17531
17532 case Intrinsic::arm_neon_vrshifts:
17533 case Intrinsic::arm_neon_vrshiftu:
17534 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17535 break;
17536 return SDValue();
17537
17538 case Intrinsic::arm_neon_vqshifts:
17539 case Intrinsic::arm_neon_vqshiftu:
17540 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17541 break;
17542 return SDValue();
17543
17544 case Intrinsic::arm_neon_vqshiftsu:
17545 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17546 break;
17547 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17548
17549 case Intrinsic::arm_neon_vrshiftn:
17550 case Intrinsic::arm_neon_vqshiftns:
17551 case Intrinsic::arm_neon_vqshiftnu:
17552 case Intrinsic::arm_neon_vqshiftnsu:
17553 case Intrinsic::arm_neon_vqrshiftns:
17554 case Intrinsic::arm_neon_vqrshiftnu:
17555 case Intrinsic::arm_neon_vqrshiftnsu:
17556 // Narrowing shifts require an immediate right shift.
17557 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17558 break;
17559 llvm_unreachable("invalid shift count for narrowing vector shift "
17560 "intrinsic");
17561
17562 default:
17563 llvm_unreachable("unhandled vector shift");
17564 }
17565
17566 switch (IntNo) {
17567 case Intrinsic::arm_neon_vshifts:
17568 case Intrinsic::arm_neon_vshiftu:
17569 // Opcode already set above.
17570 break;
17571 case Intrinsic::arm_neon_vrshifts:
17572 VShiftOpc = ARMISD::VRSHRsIMM;
17573 break;
17574 case Intrinsic::arm_neon_vrshiftu:
17575 VShiftOpc = ARMISD::VRSHRuIMM;
17576 break;
17577 case Intrinsic::arm_neon_vrshiftn:
17578 VShiftOpc = ARMISD::VRSHRNIMM;
17579 break;
17580 case Intrinsic::arm_neon_vqshifts:
17581 VShiftOpc = ARMISD::VQSHLsIMM;
17582 break;
17583 case Intrinsic::arm_neon_vqshiftu:
17584 VShiftOpc = ARMISD::VQSHLuIMM;
17585 break;
17586 case Intrinsic::arm_neon_vqshiftsu:
17587 VShiftOpc = ARMISD::VQSHLsuIMM;
17588 break;
17589 case Intrinsic::arm_neon_vqshiftns:
17590 VShiftOpc = ARMISD::VQSHRNsIMM;
17591 break;
17592 case Intrinsic::arm_neon_vqshiftnu:
17593 VShiftOpc = ARMISD::VQSHRNuIMM;
17594 break;
17595 case Intrinsic::arm_neon_vqshiftnsu:
17596 VShiftOpc = ARMISD::VQSHRNsuIMM;
17597 break;
17598 case Intrinsic::arm_neon_vqrshiftns:
17599 VShiftOpc = ARMISD::VQRSHRNsIMM;
17600 break;
17601 case Intrinsic::arm_neon_vqrshiftnu:
17602 VShiftOpc = ARMISD::VQRSHRNuIMM;
17603 break;
17604 case Intrinsic::arm_neon_vqrshiftnsu:
17605 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17606 break;
17607 }
17608
17609 SDLoc dl(N);
17610 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17611 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17612 }
17613
17614 case Intrinsic::arm_neon_vshiftins: {
17615 EVT VT = N->getOperand(1).getValueType();
17616 int64_t Cnt;
17617 unsigned VShiftOpc = 0;
17618
17619 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17620 VShiftOpc = ARMISD::VSLIIMM;
17621 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17622 VShiftOpc = ARMISD::VSRIIMM;
17623 else {
17624 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17625 }
17626
17627 SDLoc dl(N);
17628 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17629 N->getOperand(1), N->getOperand(2),
17630 DAG.getConstant(Cnt, dl, MVT::i32));
17631 }
17632
17633 case Intrinsic::arm_neon_vqrshifts:
17634 case Intrinsic::arm_neon_vqrshiftu:
17635 // No immediate versions of these to check for.
17636 break;
17637
17638 case Intrinsic::arm_neon_vbsl: {
17639 SDLoc dl(N);
17640 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17641 N->getOperand(2), N->getOperand(3));
17642 }
17643 case Intrinsic::arm_mve_vqdmlah:
17644 case Intrinsic::arm_mve_vqdmlash:
17645 case Intrinsic::arm_mve_vqrdmlah:
17646 case Intrinsic::arm_mve_vqrdmlash:
17647 case Intrinsic::arm_mve_vmla_n_predicated:
17648 case Intrinsic::arm_mve_vmlas_n_predicated:
17649 case Intrinsic::arm_mve_vqdmlah_predicated:
17650 case Intrinsic::arm_mve_vqdmlash_predicated:
17651 case Intrinsic::arm_mve_vqrdmlah_predicated:
17652 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17653 // These intrinsics all take an i32 scalar operand which is narrowed to the
17654 // size of a single lane of the vector type they return. So we don't need
17655 // any bits of that operand above that point, which allows us to eliminate
17656 // uxth/sxth.
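// For example (illustrative), for a v8i16 vqdmlah only the low 16 bits of
// the i32 scalar are demanded, so an (and X, 0xffff) or sign_extend_inreg
// feeding that operand can be stripped.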
17657 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17658 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17659 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17660 return SDValue();
17661 break;
17662 }
17663
17664 case Intrinsic::arm_mve_minv:
17665 case Intrinsic::arm_mve_maxv:
17666 case Intrinsic::arm_mve_minav:
17667 case Intrinsic::arm_mve_maxav:
17668 case Intrinsic::arm_mve_minv_predicated:
17669 case Intrinsic::arm_mve_maxv_predicated:
17670 case Intrinsic::arm_mve_minav_predicated:
17671 case Intrinsic::arm_mve_maxav_predicated: {
17672 // These intrinsics all take an i32 scalar operand which is narrowed to the
17673 // size of a single lane of the vector type they take as the other input.
17674 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17675 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17676 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17677 return SDValue();
17678 break;
17679 }
17680
17681 case Intrinsic::arm_mve_addv: {
17682 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17683 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17684 bool Unsigned = N->getConstantOperandVal(2);
17685 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17686 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17687 }
17688
17689 case Intrinsic::arm_mve_addlv:
17690 case Intrinsic::arm_mve_addlv_predicated: {
17691 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17692 // which recombines the two outputs into an i64
17693 bool Unsigned = N->getConstantOperandVal(2);
17694 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17697
17699 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17700 if (i != 2) // skip the unsigned flag
17701 Ops.push_back(N->getOperand(i));
17702
17703 SDLoc dl(N);
17704 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17705 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17706 val.getValue(1));
17707 }
17708 }
17709
17710 return SDValue();
17711}
17712
17713/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17714/// lowers them. As with the vector shift intrinsics, this is done during DAG
17715/// combining instead of DAG legalizing because the build_vectors for 64-bit
17716/// vector element shift counts are generally not legal, and it is hard to see
17717/// their values after they get legalized to loads from a constant pool.
17720 const ARMSubtarget *ST) {
17721 SelectionDAG &DAG = DCI.DAG;
17722 EVT VT = N->getValueType(0);
17723
17724 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17725 N->getOperand(0)->getOpcode() == ISD::AND &&
17726 N->getOperand(0)->hasOneUse()) {
17727 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17728 return SDValue();
17729 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17730 // usually show up because instcombine prefers to canonicalize it to
17731 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17732 // out of GEP lowering in some cases.
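// For example (illustrative): (shl (and x, 0x3FF), 2) becomes
// (srl (shl x, 22), 20), avoiding the need to materialize the mask constant
// in a register on Thumb1.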
17733 SDValue N0 = N->getOperand(0);
17734 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17735 if (!ShiftAmtNode)
17736 return SDValue();
17737 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17738 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17739 if (!AndMaskNode)
17740 return SDValue();
17741 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17742 // Don't transform uxtb/uxth.
17743 if (AndMask == 255 || AndMask == 65535)
17744 return SDValue();
17745 if (isMask_32(AndMask)) {
17746 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17747 if (MaskedBits > ShiftAmt) {
17748 SDLoc DL(N);
17749 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17750 DAG.getConstant(MaskedBits, DL, MVT::i32));
17751 return DAG.getNode(
17752 ISD::SRL, DL, MVT::i32, SHL,
17753 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17754 }
17755 }
17756 }
17757
17758 // Nothing to be done for scalar shifts.
17759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17760 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17761 return SDValue();
17762 if (ST->hasMVEIntegerOps())
17763 return SDValue();
17764
17765 int64_t Cnt;
17766
17767 switch (N->getOpcode()) {
17768 default: llvm_unreachable("unexpected shift opcode");
17769
17770 case ISD::SHL:
17771 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17772 SDLoc dl(N);
17773 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17774 DAG.getConstant(Cnt, dl, MVT::i32));
17775 }
17776 break;
17777
17778 case ISD::SRA:
17779 case ISD::SRL:
17780 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17781 unsigned VShiftOpc =
17782 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17783 SDLoc dl(N);
17784 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17785 DAG.getConstant(Cnt, dl, MVT::i32));
17786 }
17787 }
17788 return SDValue();
17789}
17790
17791 // Look for a sign/zero/fp extend of a larger-than-legal load. This can be
17792// split into multiple extending loads, which are simpler to deal with than an
17793// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17794// to convert the type to an f32.
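// For example (illustrative): a v8i32 sign_extend of a v8i8 load becomes two
// v4i8 -> v4i32 sextloads at byte offsets 0 and 4, concatenated back into a
// single v8i32 result.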
17796 SDValue N0 = N->getOperand(0);
17797 if (N0.getOpcode() != ISD::LOAD)
17798 return SDValue();
17800 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17801 LD->getExtensionType() != ISD::NON_EXTLOAD)
17802 return SDValue();
17803 EVT FromVT = LD->getValueType(0);
17804 EVT ToVT = N->getValueType(0);
17805 if (!ToVT.isVector())
17806 return SDValue();
17808 EVT ToEltVT = ToVT.getVectorElementType();
17809 EVT FromEltVT = FromVT.getVectorElementType();
17810
17811 unsigned NumElements = 0;
17812 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17813 NumElements = 4;
17814 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17815 NumElements = 4;
17816 if (NumElements == 0 ||
17817 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17818 FromVT.getVectorNumElements() % NumElements != 0 ||
17819 !isPowerOf2_32(NumElements))
17820 return SDValue();
17821
17822 LLVMContext &C = *DAG.getContext();
17823 SDLoc DL(LD);
17824 // Details about the old load
17825 SDValue Ch = LD->getChain();
17826 SDValue BasePtr = LD->getBasePtr();
17827 Align Alignment = LD->getBaseAlign();
17828 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17829 AAMDNodes AAInfo = LD->getAAInfo();
17830
17831 ISD::LoadExtType NewExtType =
17832 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17833 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17834 EVT NewFromVT = EVT::getVectorVT(
17835 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17836 EVT NewToVT = EVT::getVectorVT(
17837 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17838
17841 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17842 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17843 SDValue NewPtr =
17844 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17845
17846 SDValue NewLoad =
17847 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17848 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17849 Alignment, MMOFlags, AAInfo);
17850 Loads.push_back(NewLoad);
17851 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17852 }
17853
17854 // Float truncs need to be extended with VCVTB's into their floating point types.
17855 if (FromEltVT == MVT::f16) {
17857
17858 for (unsigned i = 0; i < Loads.size(); i++) {
17859 SDValue LoadBC =
17860 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17861 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17862 DAG.getConstant(0, DL, MVT::i32));
17863 Extends.push_back(FPExt);
17864 }
17865
17866 Loads = Extends;
17867 }
17868
17869 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17870 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17871 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17872}
17873
17874/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17875/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17877 const ARMSubtarget *ST) {
17878 SDValue N0 = N->getOperand(0);
17879
17880 // Check for sign- and zero-extensions of vector extract operations of 8- and
17881 // 16-bit vector elements. NEON and MVE support these directly. They are
17882 // handled during DAG combining because type legalization will promote them
17883 // to 32-bit types and it is messy to recognize the operations after that.
17884 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17886 SDValue Vec = N0.getOperand(0);
17887 SDValue Lane = N0.getOperand(1);
17888 EVT VT = N->getValueType(0);
17889 EVT EltVT = N0.getValueType();
17890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17891
17892 if (VT == MVT::i32 &&
17893 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17894 TLI.isTypeLegal(Vec.getValueType()) &&
17895 isa<ConstantSDNode>(Lane)) {
17896
17897 unsigned Opc = 0;
17898 switch (N->getOpcode()) {
17899 default: llvm_unreachable("unexpected opcode");
17900 case ISD::SIGN_EXTEND:
17902 break;
17903 case ISD::ZERO_EXTEND:
17904 case ISD::ANY_EXTEND:
17906 break;
17907 }
17908 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17909 }
17910 }
17911
17912 if (ST->hasMVEIntegerOps())
17913 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17914 return NewLoad;
17915
17916 return SDValue();
17917}
17918
17920 const ARMSubtarget *ST) {
17921 if (ST->hasMVEFloatOps())
17922 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17923 return NewLoad;
17924
17925 return SDValue();
17926}
17927
17928// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17929// constant bounds.
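// For example (illustrative): smin(smax(x, -128), 127) clamps x to a signed
// 8-bit range and becomes an SSAT, while smin(smax(x, 0), 255) clamps to an
// unsigned 8-bit range and becomes a USAT.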
17931 const ARMSubtarget *Subtarget) {
17932 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17933 !Subtarget->isThumb2())
17934 return SDValue();
17935
17936 EVT VT = Op.getValueType();
17937 SDValue Op0 = Op.getOperand(0);
17938
17939 if (VT != MVT::i32 ||
17940 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17941 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17943 return SDValue();
17944
17945 SDValue Min = Op;
17946 SDValue Max = Op0;
17947 SDValue Input = Op0.getOperand(0);
17948 if (Min.getOpcode() == ISD::SMAX)
17949 std::swap(Min, Max);
17950
17951 APInt MinC = Min.getConstantOperandAPInt(1);
17952 APInt MaxC = Max.getConstantOperandAPInt(1);
17953
17954 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17955 !(MinC + 1).isPowerOf2())
17956 return SDValue();
17957
17958 SDLoc DL(Op);
17959 if (MinC == ~MaxC)
17960 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17961 DAG.getConstant(MinC.countr_one(), DL, VT));
17962 if (MaxC == 0)
17963 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17964 DAG.getConstant(MinC.countr_one(), DL, VT));
17965
17966 return SDValue();
17967}
17968
17969/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17970/// saturates.
17972 const ARMSubtarget *ST) {
17973 EVT VT = N->getValueType(0);
17974 SDValue N0 = N->getOperand(0);
17975
17976 if (VT == MVT::i32)
17977 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17978
17979 if (!ST->hasMVEIntegerOps())
17980 return SDValue();
17981
17982 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17983 return V;
17984
17985 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17986 return SDValue();
17987
17988 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17989 // Check one is a smin and the other is a smax
17990 if (Min->getOpcode() != ISD::SMIN)
17991 std::swap(Min, Max);
17992 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17993 return false;
17994
17995 APInt SaturateC;
17996 if (VT == MVT::v4i32)
17997 SaturateC = APInt(32, (1 << 15) - 1, true);
17998 else //if (VT == MVT::v8i16)
17999 SaturateC = APInt(16, (1 << 7) - 1, true);
18000
18001 APInt MinC, MaxC;
18002 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18003 MinC != SaturateC)
18004 return false;
18005 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18006 MaxC != ~SaturateC)
18007 return false;
18008 return true;
18009 };
18010
18011 if (IsSignedSaturate(N, N0.getNode())) {
18012 SDLoc DL(N);
18013 MVT ExtVT, HalfVT;
18014 if (VT == MVT::v4i32) {
18015 HalfVT = MVT::v8i16;
18016 ExtVT = MVT::v4i16;
18017 } else { // if (VT == MVT::v8i16)
18018 HalfVT = MVT::v16i8;
18019 ExtVT = MVT::v8i8;
18020 }
18021
18022 // Create a VQMOVNB with undef top lanes, then sign extend into the top
18023 // half. That extend will hopefully be removed if only the bottom bits are
18024 // demanded (through a truncating store, for example).
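// For example (illustrative): smin(smax(x:v4i32, -32768), 32767) becomes a
// VQMOVNs of x into the bottom i16 lanes of an undef v8i16, reinterpreted as
// v4i32 and sign_extend_inreg'd from v4i16.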
18025 SDValue VQMOVN =
18026 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18027 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18028 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18029 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18030 DAG.getValueType(ExtVT));
18031 }
18032
18033 auto IsUnsignedSaturate = [&](SDNode *Min) {
18034 // For unsigned, we just need to check for <= 0xffff
18035 if (Min->getOpcode() != ISD::UMIN)
18036 return false;
18037
18038 APInt SaturateC;
18039 if (VT == MVT::v4i32)
18040 SaturateC = APInt(32, (1 << 16) - 1, true);
18041 else //if (VT == MVT::v8i16)
18042 SaturateC = APInt(16, (1 << 8) - 1, true);
18043
18044 APInt MinC;
18045 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18046 MinC != SaturateC)
18047 return false;
18048 return true;
18049 };
18050
18051 if (IsUnsignedSaturate(N)) {
18052 SDLoc DL(N);
18053 MVT HalfVT;
18054 unsigned ExtConst;
18055 if (VT == MVT::v4i32) {
18056 HalfVT = MVT::v8i16;
18057 ExtConst = 0x0000FFFF;
18058 } else { //if (VT == MVT::v8i16)
18059 HalfVT = MVT::v16i8;
18060 ExtConst = 0x00FF;
18061 }
18062
18063 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18064 // an AND. That extend will hopefully be removed if only the bottom bits are
18065 // demanded (through a truncating store, for example).
18066 SDValue VQMOVN =
18067 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18068 DAG.getConstant(0, DL, MVT::i32));
18069 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18070 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18071 DAG.getConstant(ExtConst, DL, VT));
18072 }
18073
18074 return SDValue();
18075}
18076
18079 if (!C)
18080 return nullptr;
18081 const APInt *CV = &C->getAPIntValue();
18082 return CV->isPowerOf2() ? CV : nullptr;
18083}
18084
18086 // If we have a CMOV, OR and AND combination such as:
18087 // if (x & CN)
18088 // y |= CM;
18089 //
18090 // And:
18091 // * CN is a single bit;
18092 // * All bits covered by CM are known zero in y
18093 //
18094 // Then we can convert this into a sequence of BFI instructions. This will
18095 // always be a win if CM is a single bit, will always be no worse than the
18096 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18097 // three bits (due to the extra IT instruction).
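// For example (illustrative): "if (x & 0x4) y |= 0x30", with bits 0x30 known
// zero in y, becomes a single LSR of x by 2 followed by two BFIs that copy
// that bit into bits 4 and 5 of y.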
18098
18099 SDValue Op0 = CMOV->getOperand(0);
18100 SDValue Op1 = CMOV->getOperand(1);
18101 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18102 SDValue CmpZ = CMOV->getOperand(3);
18103
18104 // The compare must be against zero.
18105 if (!isNullConstant(CmpZ->getOperand(1)))
18106 return SDValue();
18107
18108 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18109 SDValue And = CmpZ->getOperand(0);
18110 if (And->getOpcode() != ISD::AND)
18111 return SDValue();
18112 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18113 if (!AndC)
18114 return SDValue();
18115 SDValue X = And->getOperand(0);
18116
18117 if (CC == ARMCC::EQ) {
18118 // We're performing an "equal to zero" compare. Swap the operands so we
18119 // canonicalize on a "not equal to zero" compare.
18120 std::swap(Op0, Op1);
18121 } else {
18122 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18123 }
18124
18125 if (Op1->getOpcode() != ISD::OR)
18126 return SDValue();
18127
18129 if (!OrC)
18130 return SDValue();
18131 SDValue Y = Op1->getOperand(0);
18132
18133 if (Op0 != Y)
18134 return SDValue();
18135
18136 // Now, is it profitable to continue?
18137 APInt OrCI = OrC->getAPIntValue();
18138 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18139 if (OrCI.popcount() > Heuristic)
18140 return SDValue();
18141
18142 // Lastly, can we determine that the bits defined by OrCI
18143 // are zero in Y?
18144 KnownBits Known = DAG.computeKnownBits(Y);
18145 if ((OrCI & Known.Zero) != OrCI)
18146 return SDValue();
18147
18148 // OK, we can do the combine.
18149 SDValue V = Y;
18150 SDLoc dl(X);
18151 EVT VT = X.getValueType();
18152 unsigned BitInX = AndC->logBase2();
18153
18154 if (BitInX != 0) {
18155 // We must shift X first.
18156 X = DAG.getNode(ISD::SRL, dl, VT, X,
18157 DAG.getConstant(BitInX, dl, VT));
18158 }
18159
18160 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18161 BitInY < NumActiveBits; ++BitInY) {
18162 if (OrCI[BitInY] == 0)
18163 continue;
18164 APInt Mask(VT.getSizeInBits(), 0);
18165 Mask.setBit(BitInY);
18166 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18167 // Confusingly, the operand is an *inverted* mask.
18168 DAG.getConstant(~Mask, dl, VT));
18169 }
18170
18171 return V;
18172}
18173
18174// Given N, the value controlling the conditional branch, search for the loop
18175// intrinsic, returning it, along with how the value is used. We need to handle
18176// patterns such as the following:
18177// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18178// (brcond (setcc (loop.decrement), 0, eq), exit)
18179// (brcond (setcc (loop.decrement), 0, ne), header)
18181 bool &Negate) {
18182 switch (N->getOpcode()) {
18183 default:
18184 break;
18185 case ISD::XOR: {
18186 if (!isa<ConstantSDNode>(N.getOperand(1)))
18187 return SDValue();
18188 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18189 return SDValue();
18190 Negate = !Negate;
18191 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18192 }
18193 case ISD::SETCC: {
18194 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18195 if (!Const)
18196 return SDValue();
18197 if (Const->isZero())
18198 Imm = 0;
18199 else if (Const->isOne())
18200 Imm = 1;
18201 else
18202 return SDValue();
18203 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18204 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18205 }
18207 unsigned IntOp = N.getConstantOperandVal(1);
18208 if (IntOp != Intrinsic::test_start_loop_iterations &&
18209 IntOp != Intrinsic::loop_decrement_reg)
18210 return SDValue();
18211 return N;
18212 }
18213 }
18214 return SDValue();
18215}
18216
18219 const ARMSubtarget *ST) {
18220
18221 // The hwloop intrinsics that we're interested in are used for control flow,
18222 // either for entering or exiting the loop:
18223 // - test.start.loop.iterations will test whether its operand is zero. If it
18224 // is zero, the following branch should not enter the loop.
18225 // - loop.decrement.reg also tests whether its operand is zero. If it is
18226 // zero, the following branch should not branch back to the beginning of
18227 // the loop.
18228 // So here, we need to check how the brcond is using the result of each
18229 // of the intrinsics to ensure that we're branching to the right place at the
18230 // right time.
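// For example (illustrative):
//   (brcond (setcc (test.start.loop.iterations n), 0, eq), exit)
// becomes a WLSSETUP feeding a WLS that branches to 'exit' when the count is
// zero, while
//   (brcond (setcc (loop.decrement.reg lr, 1), 0, ne), header)
// becomes a LOOP_DEC feeding an LE that branches back to 'header' while the
// count is non-zero.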
18231
18232 ISD::CondCode CC;
18233 SDValue Cond;
18234 int Imm = 1;
18235 bool Negate = false;
18236 SDValue Chain = N->getOperand(0);
18237 SDValue Dest;
18238
18239 if (N->getOpcode() == ISD::BRCOND) {
18240 CC = ISD::SETEQ;
18241 Cond = N->getOperand(1);
18242 Dest = N->getOperand(2);
18243 } else {
18244 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18245 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18246 Cond = N->getOperand(2);
18247 Dest = N->getOperand(4);
18248 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18249 if (!Const->isOne() && !Const->isZero())
18250 return SDValue();
18251 Imm = Const->getZExtValue();
18252 } else
18253 return SDValue();
18254 }
18255
18256 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18257 if (!Int)
18258 return SDValue();
18259
18260 if (Negate)
18261 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18262
18263 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18264 return (CC == ISD::SETEQ && Imm == 0) ||
18265 (CC == ISD::SETNE && Imm == 1) ||
18266 (CC == ISD::SETLT && Imm == 1) ||
18267 (CC == ISD::SETULT && Imm == 1);
18268 };
18269
18270 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18271 return (CC == ISD::SETEQ && Imm == 1) ||
18272 (CC == ISD::SETNE && Imm == 0) ||
18273 (CC == ISD::SETGT && Imm == 0) ||
18274 (CC == ISD::SETUGT && Imm == 0) ||
18275 (CC == ISD::SETGE && Imm == 1) ||
18276 (CC == ISD::SETUGE && Imm == 1);
18277 };
18278
18279 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18280 "unsupported condition");
18281
18282 SDLoc dl(Int);
18283 SelectionDAG &DAG = DCI.DAG;
18284 SDValue Elements = Int.getOperand(2);
18285 unsigned IntOp = Int->getConstantOperandVal(1);
18286 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18287 "expected single br user");
18288 SDNode *Br = *N->user_begin();
18289 SDValue OtherTarget = Br->getOperand(1);
18290
18291 // Update the unconditional branch to branch to the given Dest.
18292 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18293 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18294 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18295 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18296 };
18297
18298 if (IntOp == Intrinsic::test_start_loop_iterations) {
18299 SDValue Res;
18300 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18301 // We expect this 'instruction' to branch when the counter is zero.
18302 if (IsTrueIfZero(CC, Imm)) {
18303 SDValue Ops[] = {Chain, Setup, Dest};
18304 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18305 } else {
18306 // The logic is the reverse of what we need for WLS, so find the other
18307 // basic block target: the target of the following br.
18308 UpdateUncondBr(Br, Dest, DAG);
18309
18310 SDValue Ops[] = {Chain, Setup, OtherTarget};
18311 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18312 }
18313 // Update LR count to the new value
18314 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18315 // Update chain
18316 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18317 return Res;
18318 } else {
18319 SDValue Size =
18320 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18321 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18322 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18323 DAG.getVTList(MVT::i32, MVT::Other), Args);
18324 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18325
18326 // We expect this instruction to branch when the count is not zero.
18327 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18328
18329 // Update the unconditional branch to target the loop preheader if we've
18330 // found the condition has been reversed.
18331 if (Target == OtherTarget)
18332 UpdateUncondBr(Br, Dest, DAG);
18333
18334 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18335 SDValue(LoopDec.getNode(), 1), Chain);
18336
18337 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18338 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18339 }
18340 return SDValue();
18341}
18342
18343/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18344SDValue
18346 SDValue Cmp = N->getOperand(3);
18347 if (Cmp.getOpcode() != ARMISD::CMPZ)
18348 // Only looking at NE cases.
18349 return SDValue();
18350
18351 SDLoc dl(N);
18352 SDValue LHS = Cmp.getOperand(0);
18353 SDValue RHS = Cmp.getOperand(1);
18354 SDValue Chain = N->getOperand(0);
18355 SDValue BB = N->getOperand(1);
18356 SDValue ARMcc = N->getOperand(2);
18358
18359 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18360 // -> (brcond Chain BB CC Flags)
18361 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18362 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18363 LHS->getOperand(0)->hasOneUse() &&
18364 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18365 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18366 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18367 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18368 LHS->getOperand(0)->getOperand(2),
18369 LHS->getOperand(0)->getOperand(3));
18370 }
18371
18372 return SDValue();
18373}
18374
18375/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18376SDValue
18378 SDValue Cmp = N->getOperand(3);
18379 if (Cmp.getOpcode() != ARMISD::CMPZ)
18380 // Only looking at EQ and NE cases.
18381 return SDValue();
18382
18383 EVT VT = N->getValueType(0);
18384 SDLoc dl(N);
18385 SDValue LHS = Cmp.getOperand(0);
18386 SDValue RHS = Cmp.getOperand(1);
18387 SDValue FalseVal = N->getOperand(0);
18388 SDValue TrueVal = N->getOperand(1);
18389 SDValue ARMcc = N->getOperand(2);
18391
18392 // BFI is only available on V6T2+.
18393 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18395 if (R)
18396 return R;
18397 }
18398
18399 // Simplify
18400 // mov r1, r0
18401 // cmp r1, x
18402 // mov r0, y
18403 // moveq r0, x
18404 // to
18405 // cmp r0, x
18406 // movne r0, y
18407 //
18408 // mov r1, r0
18409 // cmp r1, x
18410 // mov r0, x
18411 // movne r0, y
18412 // to
18413 // cmp r0, x
18414 // movne r0, y
18415 /// FIXME: Turn this into a target neutral optimization?
18416 SDValue Res;
18417 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18418 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18419 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18420 SDValue ARMcc;
18421 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18422 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18423 }
18424
18425 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18426 // -> (cmov F T CC Flags)
18427 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18428 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18429 isNullConstant(RHS)) {
18430 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18431 LHS->getOperand(2), LHS->getOperand(3));
18432 }
18433
18434 if (!VT.isInteger())
18435 return SDValue();
18436
18437 // Fold away an unnecessary CMPZ/CMOV
18438 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18439 // if C1==EQ -> CMOV A, B, C2, D
18440 // if C1==NE -> CMOV A, B, NOT(C2), D
18441 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18442 N->getConstantOperandVal(2) == ARMCC::NE) {
18444 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18445 if (N->getConstantOperandVal(2) == ARMCC::NE)
18447 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18448 N->getOperand(1),
18449 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18450 }
18451 }
18452
18453 // Materialize a boolean comparison for integers so we can avoid branching.
18454 if (isNullConstant(FalseVal)) {
18455 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18456 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18457 // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting it
18458 // right by 5 bits will turn that 32 into 1, otherwise it will be 0.
18459 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18460 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18461 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18462 DAG.getConstant(5, dl, MVT::i32));
18463 } else {
18464 // CMOV 0, 1, ==, (CMPZ x, y) ->
18465 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18466 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18467 //
18468 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18469 // x != y. In other words, a carry C == 1 when x == y, C == 0
18470 // otherwise.
18471 // The final UADDO_CARRY computes
18472 // x - y + (0 - (x - y)) + C == C
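// Worked example (illustrative, i32): x = 7, y = 5 gives Sub = 2; the USUBO
// of 0 - 2 borrows, so the derived carry C = 0 and the UADDO_CARRY yields
// 2 + 0xFFFFFFFE + 0 == 0. With x = y = 7, Sub = 0, there is no borrow,
// C = 1 and the result is 0 + 0 + 1 == 1.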
18473 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18474 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18475 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18476 // ISD::USUBO_CARRY returns a borrow, but what we actually want here is
18477 // the carry.
18478 SDValue Carry =
18479 DAG.getNode(ISD::SUB, dl, MVT::i32,
18480 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18481 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18482 }
18483 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18484 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18485 // This seems pointless but will allow us to combine it further below.
18486 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18487 SDValue Sub =
18488 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18489 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18490 Sub.getValue(1));
18491 FalseVal = Sub;
18492 }
18493 } else if (isNullConstant(TrueVal)) {
18494 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18495 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18496 // This seems pointless but will allow us to combine it further below
18497 // Note that we change == for != as this is the dual for the case above.
18498 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18499 SDValue Sub =
18500 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18501 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18502 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18503 Sub.getValue(1));
18504 FalseVal = Sub;
18505 }
18506 }
18507
18508 // On Thumb1, the DAG above may be further combined if z is a power of 2
18509 // (z == 2 ^ K).
18510 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18511 // t1 = (USUBO (SUB x, y), 1)
18512 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18513 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18514 //
18515 // This also handles the special case of comparing against zero; it's
18516 // essentially the same pattern, except there's no SUBC:
18517 // CMOV x, z, !=, (CMPZ x, 0) ->
18518 // t1 = (USUBO x, 1)
18519 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18520 // Result = if K != 0 then (SHL t2:0, K) else t2:0
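// For example (illustrative), with z == 4 (K == 2):
// CMOV (SUBC x, y), 4, !=, (SUBC x, y):1 lowers to the USUBO/USUBO_CARRY
// pair computing the i1 value of (x != y), shifted left by 2.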
18521 const APInt *TrueConst;
18522 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18523 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18524 FalseVal.getOperand(1) == RHS) ||
18525 (FalseVal == LHS && isNullConstant(RHS))) &&
18526 (TrueConst = isPowerOf2Constant(TrueVal))) {
18527 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18528 unsigned ShiftAmount = TrueConst->logBase2();
18529 if (ShiftAmount)
18530 TrueVal = DAG.getConstant(1, dl, VT);
18531 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18532 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18533 Subc.getValue(1));
18534
18535 if (ShiftAmount)
18536 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18537 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18538 }
18539
18540 if (Res.getNode()) {
18541 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18542 // Capture demanded bits information that would otherwise be lost.
18543 if (Known.Zero == 0xfffffffe)
18544 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18545 DAG.getValueType(MVT::i1));
18546 else if (Known.Zero == 0xffffff00)
18547 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18548 DAG.getValueType(MVT::i8));
18549 else if (Known.Zero == 0xffff0000)
18550 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18551 DAG.getValueType(MVT::i16));
18552 }
18553
18554 return Res;
18555}
18556
18559 const ARMSubtarget *ST) {
18560 SelectionDAG &DAG = DCI.DAG;
18561 SDValue Src = N->getOperand(0);
18562 EVT DstVT = N->getValueType(0);
18563
18564 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18565 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18566 EVT SrcVT = Src.getValueType();
18567 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18568 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18569 }
18570
18571 // We may have a bitcast of something that has already had this bitcast
18572 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18573 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18574 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18575 Src.getValueType().getScalarSizeInBits())
18576 Src = Src.getOperand(0);
18577
18578 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18579 // would be generated is at least the width of the element type.
18580 EVT SrcVT = Src.getValueType();
18581 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18582 Src.getOpcode() == ARMISD::VMVNIMM ||
18583 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18584 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18585 DAG.getDataLayout().isBigEndian())
18586 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18587
18588 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18589 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18590 return R;
18591
18592 return SDValue();
18593}
18594
18595 // Some combines for the MVETrunc truncation legalizer helper. This also lowers
18596 // the node into stack operations after legalizeOps.
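// For example (illustrative): MVETRUNC(v4i32 a, v4i32 b) produces a v8i16
// whose lanes are trunc(a[0..3]) followed by trunc(b[0..3]), keeping the
// original lane order.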
18599 SelectionDAG &DAG = DCI.DAG;
18600 EVT VT = N->getValueType(0);
18601 SDLoc DL(N);
18602
18603 // MVETrunc(Undef, Undef) -> Undef
18604 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18605 return DAG.getUNDEF(VT);
18606
18607 // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
18608 if (N->getNumOperands() == 2 &&
18609 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18610 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18611 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18612 N->getOperand(0).getOperand(1),
18613 N->getOperand(1).getOperand(0),
18614 N->getOperand(1).getOperand(1));
18615
18616 // MVETrunc(shuffle, shuffle) -> VMOVN
18617 if (N->getNumOperands() == 2 &&
18618 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18619 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18620 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18621 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18622
18623 if (S0->getOperand(0) == S1->getOperand(0) &&
18624 S0->getOperand(1) == S1->getOperand(1)) {
18625 // Construct complete shuffle mask
18626 SmallVector<int, 8> Mask(S0->getMask());
18627 Mask.append(S1->getMask().begin(), S1->getMask().end());
18628
18629 if (isVMOVNTruncMask(Mask, VT, false))
18630 return DAG.getNode(
18631 ARMISD::VMOVN, DL, VT,
18632 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18633 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18634 DAG.getConstant(1, DL, MVT::i32));
18635 if (isVMOVNTruncMask(Mask, VT, true))
18636 return DAG.getNode(
18637 ARMISD::VMOVN, DL, VT,
18638 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18639 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18640 DAG.getConstant(1, DL, MVT::i32));
18641 }
18642 }
18643
18644 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18645 // truncate to a buildvector to allow the generic optimisations to kick in.
18646 if (all_of(N->ops(), [](SDValue Op) {
18647 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18648 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18649 (Op.getOpcode() == ISD::BITCAST &&
18650 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18651 })) {
18652 SmallVector<SDValue, 8> Extracts;
18653 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18654 SDValue O = N->getOperand(Op);
18655 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18656 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18657 DAG.getConstant(i, DL, MVT::i32));
18658 Extracts.push_back(Ext);
18659 }
18660 }
18661 return DAG.getBuildVector(VT, DL, Extracts);
18662 }
18663
18664 // If we are late in the legalization process and nothing has optimised
18665 // the trunc to anything better, lower it to a stack store and reload,
18666 // performing the truncation whilst keeping the lanes in the correct order:
18667 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18668 if (!DCI.isAfterLegalizeDAG())
18669 return SDValue();
18670
18671 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18672 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18673 int NumIns = N->getNumOperands();
18674 assert((NumIns == 2 || NumIns == 4) &&
18675 "Expected 2 or 4 inputs to an MVETrunc");
18676 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18677 if (N->getNumOperands() == 4)
18678 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18679
18680 SmallVector<SDValue> Chains;
18681 for (int I = 0; I < NumIns; I++) {
18682 SDValue Ptr = DAG.getNode(
18683 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18684 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18686 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18687 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18688 Ptr, MPI, StoreVT, Align(4));
18689 Chains.push_back(Ch);
18690 }
18691
18692 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18693 MachinePointerInfo MPI =
18695 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18696}
18697
18698 // Take an MVEEXT(load x) and split it into (extload x, extload x+8)
18700 SelectionDAG &DAG) {
18701 SDValue N0 = N->getOperand(0);
18703 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18704 return SDValue();
18705
18706 EVT FromVT = LD->getMemoryVT();
18707 EVT ToVT = N->getValueType(0);
18708 if (!ToVT.isVector())
18709 return SDValue();
18710 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18711 EVT ToEltVT = ToVT.getVectorElementType();
18712 EVT FromEltVT = FromVT.getVectorElementType();
18713
18714 unsigned NumElements = 0;
18715 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18716 NumElements = 4;
18717 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18718 NumElements = 8;
18719 assert(NumElements != 0);
18720
18721 ISD::LoadExtType NewExtType =
18722 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18723 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18724 LD->getExtensionType() != ISD::EXTLOAD &&
18725 LD->getExtensionType() != NewExtType)
18726 return SDValue();
18727
18728 LLVMContext &C = *DAG.getContext();
18729 SDLoc DL(LD);
18730 // Details about the old load
18731 SDValue Ch = LD->getChain();
18732 SDValue BasePtr = LD->getBasePtr();
18733 Align Alignment = LD->getBaseAlign();
18734 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18735 AAMDNodes AAInfo = LD->getAAInfo();
18736
18737 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18738 EVT NewFromVT = EVT::getVectorVT(
18739 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18740 EVT NewToVT = EVT::getVectorVT(
18741 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18742
18745 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18746 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18747 SDValue NewPtr =
18748 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18749
18750 SDValue NewLoad =
18751 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18752 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18753 Alignment, MMOFlags, AAInfo);
18754 Loads.push_back(NewLoad);
18755 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18756 }
18757
18758 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18759 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18760 return DAG.getMergeValues(Loads, DL);
18761}
18762
18763 // Perform combines for MVEEXT. If it has not been optimized to anything better
18764 // before lowering, it gets converted to a stack store and extloads performing
18765 // the extend whilst still keeping the same lane ordering.
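// For example (illustrative): MVESEXT of a v8i16 produces two v4i32 results
// holding the sign-extended low and high halves of the input, in lane order.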
18768 SelectionDAG &DAG = DCI.DAG;
18769 EVT VT = N->getValueType(0);
18770 SDLoc DL(N);
18771 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18772 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18773
18774 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18775 *DAG.getContext());
18776 auto Extend = [&](SDValue V) {
18777 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18778 return N->getOpcode() == ARMISD::MVESEXT
18779 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18780 DAG.getValueType(ExtVT))
18781 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18782 };
18783
18784 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18785 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18786 SDValue Ext = Extend(N->getOperand(0));
18787 return DAG.getMergeValues({Ext, Ext}, DL);
18788 }
18789
18790 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18791 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18792 ArrayRef<int> Mask = SVN->getMask();
18793 assert(Mask.size() == 2 * VT.getVectorNumElements());
18794 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18795 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18796 SDValue Op0 = SVN->getOperand(0);
18797 SDValue Op1 = SVN->getOperand(1);
18798
18799 auto CheckInregMask = [&](int Start, int Offset) {
18800 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18801 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18802 return false;
18803 return true;
18804 };
18805 SDValue V0 = SDValue(N, 0);
18806 SDValue V1 = SDValue(N, 1);
18807 if (CheckInregMask(0, 0))
18808 V0 = Extend(Op0);
18809 else if (CheckInregMask(0, 1))
18810 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18811 else if (CheckInregMask(0, Mask.size()))
18812 V0 = Extend(Op1);
18813 else if (CheckInregMask(0, Mask.size() + 1))
18814 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18815
18816 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18817 V1 = Extend(Op1);
18818 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18819 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18820 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18821 V1 = Extend(Op0);
18822 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18823 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18824
18825 if (V0.getNode() != N || V1.getNode() != N)
18826 return DAG.getMergeValues({V0, V1}, DL);
18827 }
18828
18829 // MVEEXT(load) -> extload, extload
18830 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18832 return L;
18833
18834 if (!DCI.isAfterLegalizeDAG())
18835 return SDValue();
18836
18837 // Lower to a stack store and reload:
18838 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18839 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18840 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18841 int NumOuts = N->getNumValues();
18842 assert((NumOuts == 2 || NumOuts == 4) &&
18843 "Expected 2 or 4 outputs to an MVEEXT");
18844 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18845 *DAG.getContext());
18846 if (N->getNumOperands() == 4)
18847 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18848
18849 MachinePointerInfo MPI =
18851 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18852 StackPtr, MPI, Align(4));
18853
18855 for (int I = 0; I < NumOuts; I++) {
18856 SDValue Ptr = DAG.getNode(
18857 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18858 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18860 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18861 SDValue Load = DAG.getExtLoad(
18862 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18863 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18864 Loads.push_back(Load);
18865 }
18866
18867 return DAG.getMergeValues(Loads, DL);
18868}
18869
18871 DAGCombinerInfo &DCI) const {
18872 switch (N->getOpcode()) {
18873 default: break;
18874 case ISD::SELECT_CC:
18875 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18876 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18877 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18878 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18879 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18880 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18881 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18882 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18883 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18884 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18885 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18886 case ISD::BRCOND:
18887 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18888 case ARMISD::ADDC:
18889 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18890 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18891 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18892 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18893 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18894 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18895 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18896 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18897 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18900 return PerformExtractEltCombine(N, DCI, Subtarget);
18904 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18905 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18906 case ISD::FP_TO_SINT:
18907 case ISD::FP_TO_UINT:
18908 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18909 case ISD::FADD:
18910 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18911 case ISD::FMUL:
18912 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18914 return PerformIntrinsicCombine(N, DCI);
18915 case ISD::SHL:
18916 case ISD::SRA:
18917 case ISD::SRL:
18918 return PerformShiftCombine(N, DCI, Subtarget);
18919 case ISD::SIGN_EXTEND:
18920 case ISD::ZERO_EXTEND:
18921 case ISD::ANY_EXTEND:
18922 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18923 case ISD::FP_EXTEND:
18924 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18925 case ISD::SMIN:
18926 case ISD::UMIN:
18927 case ISD::SMAX:
18928 case ISD::UMAX:
18929 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18930 case ARMISD::CMOV:
18931 return PerformCMOVCombine(N, DCI.DAG);
18932 case ARMISD::BRCOND:
18933 return PerformBRCONDCombine(N, DCI.DAG);
18934 case ARMISD::CMPZ:
18935 return PerformCMPZCombine(N, DCI.DAG);
18936 case ARMISD::CSINC:
18937 case ARMISD::CSINV:
18938 case ARMISD::CSNEG:
18939 return PerformCSETCombine(N, DCI.DAG);
18940 case ISD::LOAD:
18941 return PerformLOADCombine(N, DCI, Subtarget);
18942 case ARMISD::VLD1DUP:
18943 case ARMISD::VLD2DUP:
18944 case ARMISD::VLD3DUP:
18945 case ARMISD::VLD4DUP:
18946 return PerformVLDCombine(N, DCI);
18947 case ARMISD::BUILD_VECTOR:
18948 return PerformARMBUILD_VECTORCombine(N, DCI);
18949 case ISD::BITCAST:
18950 return PerformBITCASTCombine(N, DCI, Subtarget);
18951 case ARMISD::PREDICATE_CAST:
18952 return PerformPREDICATE_CASTCombine(N, DCI);
18953 case ARMISD::VECTOR_REG_CAST:
18954 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18955 case ARMISD::MVETRUNC:
18956 return PerformMVETruncCombine(N, DCI);
18957 case ARMISD::MVESEXT:
18958 case ARMISD::MVEZEXT:
18959 return PerformMVEExtCombine(N, DCI);
18960 case ARMISD::VCMP:
18961 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18962 case ISD::VECREDUCE_ADD:
18963 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18964 case ARMISD::VADDVs:
18965 case ARMISD::VADDVu:
18966 case ARMISD::VADDLVs:
18967 case ARMISD::VADDLVu:
18968 case ARMISD::VADDLVAs:
18969 case ARMISD::VADDLVAu:
18970 case ARMISD::VMLAVs:
18971 case ARMISD::VMLAVu:
18972 case ARMISD::VMLALVs:
18973 case ARMISD::VMLALVu:
18974 case ARMISD::VMLALVAs:
18975 case ARMISD::VMLALVAu:
18976 return PerformReduceShuffleCombine(N, DCI.DAG);
18977 case ARMISD::VMOVN:
18978 return PerformVMOVNCombine(N, DCI);
18979 case ARMISD::VQMOVNs:
18980 case ARMISD::VQMOVNu:
18981 return PerformVQMOVNCombine(N, DCI);
18982 case ARMISD::VQDMULH:
18983 return PerformVQDMULHCombine(N, DCI);
18984 case ARMISD::ASRL:
18985 case ARMISD::LSRL:
18986 case ARMISD::LSLL:
18987 return PerformLongShiftCombine(N, DCI.DAG);
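  // SMULWB/SMULWT only read the bottom/top 16 bits of their second operand,
  // so SimplifyDemandedBits can strip redundant masks or shifts feeding it.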
18988 case ARMISD::SMULWB: {
18989 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18990 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18991 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18992 return SDValue();
18993 break;
18994 }
18995 case ARMISD::SMULWT: {
18996 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18997 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18998 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18999 return SDValue();
19000 break;
19001 }
19002 case ARMISD::SMLALBB:
19003 case ARMISD::QADD16b:
19004 case ARMISD::QSUB16b:
19005 case ARMISD::UQADD16b:
19006 case ARMISD::UQSUB16b: {
19007 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19008 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19009 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19010 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19011 return SDValue();
19012 break;
19013 }
19014 case ARMISD::SMLALBT: {
19015 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19016 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19017 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19018 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19019 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19020 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19021 return SDValue();
19022 break;
19023 }
19024 case ARMISD::SMLALTB: {
19025 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19026 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19027 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19028 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19029 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19030 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19031 return SDValue();
19032 break;
19033 }
19034 case ARMISD::SMLALTT: {
19035 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19036 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19037 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19038 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19039 return SDValue();
19040 break;
19041 }
19042 case ARMISD::QADD8b:
19043 case ARMISD::QSUB8b:
19044 case ARMISD::UQADD8b:
19045 case ARMISD::UQSUB8b: {
19046 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19047 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19048 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19049 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19050 return SDValue();
19051 break;
19052 }
19053 case ARMISD::VBSP:
19054 if (N->getOperand(1) == N->getOperand(2))
19055 return N->getOperand(1);
19056 return SDValue();
19057 case ISD::INTRINSIC_VOID:
19058 case ISD::INTRINSIC_W_CHAIN:
19059 switch (N->getConstantOperandVal(1)) {
19060 case Intrinsic::arm_neon_vld1:
19061 case Intrinsic::arm_neon_vld1x2:
19062 case Intrinsic::arm_neon_vld1x3:
19063 case Intrinsic::arm_neon_vld1x4:
19064 case Intrinsic::arm_neon_vld2:
19065 case Intrinsic::arm_neon_vld3:
19066 case Intrinsic::arm_neon_vld4:
19067 case Intrinsic::arm_neon_vld2lane:
19068 case Intrinsic::arm_neon_vld3lane:
19069 case Intrinsic::arm_neon_vld4lane:
19070 case Intrinsic::arm_neon_vld2dup:
19071 case Intrinsic::arm_neon_vld3dup:
19072 case Intrinsic::arm_neon_vld4dup:
19073 case Intrinsic::arm_neon_vst1:
19074 case Intrinsic::arm_neon_vst1x2:
19075 case Intrinsic::arm_neon_vst1x3:
19076 case Intrinsic::arm_neon_vst1x4:
19077 case Intrinsic::arm_neon_vst2:
19078 case Intrinsic::arm_neon_vst3:
19079 case Intrinsic::arm_neon_vst4:
19080 case Intrinsic::arm_neon_vst2lane:
19081 case Intrinsic::arm_neon_vst3lane:
19082 case Intrinsic::arm_neon_vst4lane:
19083 return PerformVLDCombine(N, DCI);
19084 case Intrinsic::arm_mve_vld2q:
19085 case Intrinsic::arm_mve_vld4q:
19086 case Intrinsic::arm_mve_vst2q:
19087 case Intrinsic::arm_mve_vst4q:
19088 return PerformMVEVLDCombine(N, DCI);
19089 default: break;
19090 }
19091 break;
19092 }
19093 return SDValue();
19094}
19095
19096 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19097 EVT VT) const {
19098 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19099}
19100
19101 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19102 Align Alignment,
19103 MachineMemOperand::Flags,
19104 unsigned *Fast) const {
19105 // Depends what it gets converted into if the type is weird.
19106 if (!VT.isSimple())
19107 return false;
19108
19109 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19110 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19111 auto Ty = VT.getSimpleVT().SimpleTy;
19112
19113 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19114 // Unaligned access can use (for example) LDRB, LDRH, LDR
19115 if (AllowsUnaligned) {
19116 if (Fast)
19117 *Fast = Subtarget->hasV7Ops();
19118 return true;
19119 }
19120 }
19121
19122 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19123 // For any little-endian targets with neon, we can support unaligned ld/st
19124 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19125 // A big-endian target may also explicitly support unaligned accesses
19126 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19127 if (Fast)
19128 *Fast = 1;
19129 return true;
19130 }
19131 }
19132
19133 if (!Subtarget->hasMVEIntegerOps())
19134 return false;
19135
19136 // These are for predicates
19137 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19138 Ty == MVT::v2i1)) {
19139 if (Fast)
19140 *Fast = 1;
19141 return true;
19142 }
19143
19144 // These are for truncated stores/narrowing loads. They are fine so long as
19145 // the alignment is at least the size of the item being loaded
19146 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19147 Alignment >= VT.getScalarSizeInBits() / 8) {
19148 if (Fast)
19149 *Fast = true;
19150 return true;
19151 }
19152
19153 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19154 // VSTRW.U32 all store the vector register in exactly the same format, and
19155 // differ only in the range of their immediate offset field and the required
19156 // alignment. So there is always a store that can be used, regardless of
19157 // actual type.
19158 //
19159 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
19160 // VREV64.8) pair and get the same effect. This will likely be better than
19161 // aligning the vector through the stack.
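  // For example, a v4i32 store with Align(1) can still be emitted as
  // VSTRB.U8, which has no alignment requirement, instead of VSTRW.U32.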
19162 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19163 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19164 Ty == MVT::v2f64) {
19165 if (Fast)
19166 *Fast = 1;
19167 return true;
19168 }
19169
19170 return false;
19171}
19172
19173 EVT ARMTargetLowering::getOptimalMemOpType(
19174 LLVMContext &Context, const MemOp &Op,
19175 const AttributeList &FuncAttributes) const {
19176 // See if we can use NEON instructions for this...
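  // e.g. a memcpy of 16+ bytes with 16-byte-aligned (or fast-unaligned)
  // operands is expanded with 128-bit v2f64 copies (VLD1/VST1).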
19177 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19178 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19179 unsigned Fast;
19180 if (Op.size() >= 16 &&
19181 (Op.isAligned(Align(16)) ||
19182 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19183 MachineMemOperand::MONone, &Fast) &&
19184 Fast))) {
19185 return MVT::v2f64;
19186 } else if (Op.size() >= 8 &&
19187 (Op.isAligned(Align(8)) ||
19188 (allowsMisalignedMemoryAccesses(
19189 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19190 Fast))) {
19191 return MVT::f64;
19192 }
19193 }
19194
19195 // Let the target-independent logic figure it out.
19196 return MVT::Other;
19197}
19198
19199// 64-bit integers are split into their high and low parts and held in two
19200// different registers, so the trunc is free since the low register can just
19201// be used.
19202bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19203 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19204 return false;
19205 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19206 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19207 return (SrcBits == 64 && DestBits == 32);
19208}
19209
19210 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19211 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19212 !DstVT.isInteger())
19213 return false;
19214 unsigned SrcBits = SrcVT.getSizeInBits();
19215 unsigned DestBits = DstVT.getSizeInBits();
19216 return (SrcBits == 64 && DestBits == 32);
19217}
19218
19219 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19220 if (Val.getOpcode() != ISD::LOAD)
19221 return false;
19222
19223 EVT VT1 = Val.getValueType();
19224 if (!VT1.isSimple() || !VT1.isInteger() ||
19225 !VT2.isSimple() || !VT2.isInteger())
19226 return false;
19227
19228 switch (VT1.getSimpleVT().SimpleTy) {
19229 default: break;
19230 case MVT::i1:
19231 case MVT::i8:
19232 case MVT::i16:
19233 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19234 return true;
19235 }
19236
19237 return false;
19238}
19239
19240 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19241 if (!VT.isSimple())
19242 return false;
19243
19244 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19245 // negate values directly (fneg is free). So, we don't want to let the DAG
19246 // combiner rewrite fneg into xors and some other instructions. For f16 and
19247 // FullFP16 argument passing, some bitcast nodes may be introduced,
19248 // triggering this DAG combine rewrite, so we are avoiding that with this.
19249 switch (VT.getSimpleVT().SimpleTy) {
19250 default: break;
19251 case MVT::f16:
19252 return Subtarget->hasFullFP16();
19253 }
19254
19255 return false;
19256}
19257
19258 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19259 if (!Subtarget->hasMVEIntegerOps())
19260 return nullptr;
19261 Type *SVIType = SVI->getType();
19262 Type *ScalarType = SVIType->getScalarType();
19263
19264 if (ScalarType->isFloatTy())
19265 return Type::getInt32Ty(SVIType->getContext());
19266 if (ScalarType->isHalfTy())
19267 return Type::getInt16Ty(SVIType->getContext());
19268 return nullptr;
19269}
19270
19271 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19272 EVT VT = ExtVal.getValueType();
19273
19274 if (!isTypeLegal(VT))
19275 return false;
19276
19277 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19278 if (Ld->isExpandingLoad())
19279 return false;
19280 }
19281
19282 if (Subtarget->hasMVEIntegerOps())
19283 return true;
19284
19285 // Don't create a loadext if we can fold the extension into a wide/long
19286 // instruction.
19287 // If there's more than one user instruction, the loadext is desirable no
19288 // matter what. There can be two uses by the same instruction.
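  // e.g. on NEON a zero/sign-extended load with a single add/sub/shift user
  // can instead be matched as VADDL/VADDW, VSUBL/VSUBW or VSHLL.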
19289 if (ExtVal->use_empty() ||
19290 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19291 return true;
19292
19293 SDNode *U = *ExtVal->user_begin();
19294 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19295 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19296 return false;
19297
19298 return true;
19299}
19300
19301 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19302 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19303 return false;
19304
19305 if (!isTypeLegal(EVT::getEVT(Ty1)))
19306 return false;
19307
19308 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19309
19310 // Assuming the caller doesn't have a zeroext or signext return parameter,
19311 // truncation all the way down to i1 is valid.
19312 return true;
19313}
19314
19315/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19316/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19317/// expanded to FMAs when this method returns true, otherwise fmuladd is
19318/// expanded to fmul + fadd.
19319///
19320/// ARM supports both fused and unfused multiply-add operations; we already
19321/// lower a pair of fmul and fadd to the latter so it's not clear that there
19322/// would be a gain or that the gain would be worthwhile enough to risk
19323/// correctness bugs.
19324///
19325/// For MVE, we set this to true as it helps simplify the need for some
19326/// patterns (and we don't have the non-fused floating point instruction).
19327bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19328 EVT VT) const {
19329 if (Subtarget->useSoftFloat())
19330 return false;
19331
19332 if (!VT.isSimple())
19333 return false;
19334
19335 switch (VT.getSimpleVT().SimpleTy) {
19336 case MVT::v4f32:
19337 case MVT::v8f16:
19338 return Subtarget->hasMVEFloatOps();
19339 case MVT::f16:
19340 return Subtarget->useFPVFMx16();
19341 case MVT::f32:
19342 return Subtarget->useFPVFMx();
19343 case MVT::f64:
19344 return Subtarget->useFPVFMx64();
19345 default:
19346 break;
19347 }
19348
19349 return false;
19350}
19351
19352static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19353 if (V < 0)
19354 return false;
19355
19356 unsigned Scale = 1;
19357 switch (VT.getSimpleVT().SimpleTy) {
19358 case MVT::i1:
19359 case MVT::i8:
19360 // Scale == 1;
19361 break;
19362 case MVT::i16:
19363 // Scale == 2;
19364 Scale = 2;
19365 break;
19366 default:
19367 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19368 // Scale == 4;
19369 Scale = 4;
19370 break;
19371 }
19372
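  // The Thumb1 offset is an unsigned 5-bit field scaled by the access size,
  // so for an i32 LDR (Scale == 4) the legal offsets are 0, 4, ..., 124.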
19373 if ((V & (Scale - 1)) != 0)
19374 return false;
19375 return isUInt<5>(V / Scale);
19376}
19377
19378static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19379 const ARMSubtarget *Subtarget) {
19380 if (!VT.isInteger() && !VT.isFloatingPoint())
19381 return false;
19382 if (VT.isVector() && Subtarget->hasNEON())
19383 return false;
19384 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19385 !Subtarget->hasMVEFloatOps())
19386 return false;
19387
19388 bool IsNeg = false;
19389 if (V < 0) {
19390 IsNeg = true;
19391 V = -V;
19392 }
19393
19394 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19395
19396 // MVE: size * imm7
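  // e.g. for 32-bit elements the offset must be a multiple of 4 with a
  // magnitude of at most 508 (a 7-bit field scaled by the element size).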
19397 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19398 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19399 case MVT::i32:
19400 case MVT::f32:
19401 return isShiftedUInt<7,2>(V);
19402 case MVT::i16:
19403 case MVT::f16:
19404 return isShiftedUInt<7,1>(V);
19405 case MVT::i8:
19406 return isUInt<7>(V);
19407 default:
19408 return false;
19409 }
19410 }
19411
19412 // half VLDR: 2 * imm8
19413 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19414 return isShiftedUInt<8, 1>(V);
19415 // VLDR and LDRD: 4 * imm8
19416 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19417 return isShiftedUInt<8, 2>(V);
19418
19419 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19420 // + imm12 or - imm8
19421 if (IsNeg)
19422 return isUInt<8>(V);
19423 return isUInt<12>(V);
19424 }
19425
19426 return false;
19427}
19428
19429/// isLegalAddressImmediate - Return true if the integer value can be used
19430/// as the offset of the target addressing mode for load / store of the
19431/// given type.
19432static bool isLegalAddressImmediate(int64_t V, EVT VT,
19433 const ARMSubtarget *Subtarget) {
19434 if (V == 0)
19435 return true;
19436
19437 if (!VT.isSimple())
19438 return false;
19439
19440 if (Subtarget->isThumb1Only())
19441 return isLegalT1AddressImmediate(V, VT);
19442 else if (Subtarget->isThumb2())
19443 return isLegalT2AddressImmediate(V, VT, Subtarget);
19444
19445 // ARM mode.
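  // e.g. an i32 LDR accepts offsets in [-4095, 4095], an i16 LDRH only
  // [-255, 255], and VLDR a multiple of 4 with magnitude up to 1020.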
19446 if (V < 0)
19447 V = - V;
19448 switch (VT.getSimpleVT().SimpleTy) {
19449 default: return false;
19450 case MVT::i1:
19451 case MVT::i8:
19452 case MVT::i32:
19453 // +- imm12
19454 return isUInt<12>(V);
19455 case MVT::i16:
19456 // +- imm8
19457 return isUInt<8>(V);
19458 case MVT::f32:
19459 case MVT::f64:
19460 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19461 return false;
19462 return isShiftedUInt<8, 2>(V);
19463 }
19464}
19465
19466 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19467 EVT VT) const {
19468 int Scale = AM.Scale;
19469 if (Scale < 0)
19470 return false;
19471
19472 switch (VT.getSimpleVT().SimpleTy) {
19473 default: return false;
19474 case MVT::i1:
19475 case MVT::i8:
19476 case MVT::i16:
19477 case MVT::i32:
19478 if (Scale == 1)
19479 return true;
19480 // r + r << imm
19481 Scale = Scale & ~1;
19482 return Scale == 2 || Scale == 4 || Scale == 8;
19483 case MVT::i64:
19484 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19485 // version in Thumb mode.
19486 // r + r
19487 if (Scale == 1)
19488 return true;
19489 // r * 2 (this can be lowered to r + r).
19490 if (!AM.HasBaseReg && Scale == 2)
19491 return true;
19492 return false;
19493 case MVT::isVoid:
19494 // Note, we allow "void" uses (basically, uses that aren't loads or
19495 // stores), because arm allows folding a scale into many arithmetic
19496 // operations. This should be made more precise and revisited later.
19497
19498 // Allow r << imm, but the imm has to be a multiple of two.
19499 if (Scale & 1) return false;
19500 return isPowerOf2_32(Scale);
19501 }
19502}
19503
19504 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19505 EVT VT) const {
19506 const int Scale = AM.Scale;
19507
19508 // Negative scales are not supported in Thumb1.
19509 if (Scale < 0)
19510 return false;
19511
19512 // Thumb1 addressing modes do not support register scaling excepting the
19513 // following cases:
19514 // 1. Scale == 1 means no scaling.
19515 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19516 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19517}
19518
19519/// isLegalAddressingMode - Return true if the addressing mode represented
19520/// by AM is legal for this target, for a load/store of the specified type.
19521 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19522 const AddrMode &AM, Type *Ty,
19523 unsigned AS, Instruction *I) const {
19524 EVT VT = getValueType(DL, Ty, true);
19525 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19526 return false;
19527
19528 // Can never fold addr of global into load/store.
19529 if (AM.BaseGV)
19530 return false;
19531
19532 switch (AM.Scale) {
19533 case 0: // no scale reg, must be "r+i" or "r", or "i".
19534 break;
19535 default:
19536 // ARM doesn't support any R+R*scale+imm addr modes.
19537 if (AM.BaseOffs)
19538 return false;
19539
19540 if (!VT.isSimple())
19541 return false;
19542
19543 if (Subtarget->isThumb1Only())
19544 return isLegalT1ScaledAddressingMode(AM, VT);
19545
19546 if (Subtarget->isThumb2())
19547 return isLegalT2ScaledAddressingMode(AM, VT);
19548
19549 int Scale = AM.Scale;
19550 switch (VT.getSimpleVT().SimpleTy) {
19551 default: return false;
19552 case MVT::i1:
19553 case MVT::i8:
19554 case MVT::i32:
19555 if (Scale < 0) Scale = -Scale;
19556 if (Scale == 1)
19557 return true;
19558 // r + r << imm
19559 return isPowerOf2_32(Scale & ~1);
19560 case MVT::i16:
19561 case MVT::i64:
19562 // r +/- r
19563 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19564 return true;
19565 // r * 2 (this can be lowered to r + r).
19566 if (!AM.HasBaseReg && Scale == 2)
19567 return true;
19568 return false;
19569
19570 case MVT::isVoid:
19571 // Note, we allow "void" uses (basically, uses that aren't loads or
19572 // stores), because arm allows folding a scale into many arithmetic
19573 // operations. This should be made more precise and revisited later.
19574
19575 // Allow r << imm, but the imm has to be a multiple of two.
19576 if (Scale & 1) return false;
19577 return isPowerOf2_32(Scale);
19578 }
19579 }
19580 return true;
19581}
19582
19583/// isLegalICmpImmediate - Return true if the specified immediate is legal
19584/// icmp immediate, that is the target has icmp instructions which can compare
19585/// a register against the immediate without having to materialize the
19586/// immediate into a register.
19587 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19588 // Thumb2 and ARM modes can use cmn for negative immediates.
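  // e.g. "cmp r0, #-10" has no encoding, but the same flags can be produced
  // with "cmn r0, #10", so -10 is still a legal icmp immediate here.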
19589 if (!Subtarget->isThumb())
19590 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19591 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19592 if (Subtarget->isThumb2())
19593 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19594 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19595 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19596 return Imm >= 0 && Imm <= 255;
19597}
19598
19599/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19600/// *or sub* immediate, that is the target has add or sub instructions which can
19601/// add a register with the immediate without having to materialize the
19602/// immediate into a register.
19603 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19604 // Same encoding for add/sub, just flip the sign.
19605 uint64_t AbsImm = AbsoluteValue(Imm);
19606 if (!Subtarget->isThumb())
19607 return ARM_AM::getSOImmVal(AbsImm) != -1;
19608 if (Subtarget->isThumb2())
19609 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19610 // Thumb1 only has 8-bit unsigned immediate.
19611 return AbsImm <= 255;
19612}
19613
19614// Return false to prevent folding
19615// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19616// if the folding leads to worse code.
19617 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19618 SDValue ConstNode) const {
19619 // Let the DAGCombiner decide for vector types and large types.
19620 const EVT VT = AddNode.getValueType();
19621 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19622 return true;
19623
19624 // It is worse if c0 is legal add immediate, while c1*c0 is not
19625 // and has to be composed by at least two instructions.
19626 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19627 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19628 const int64_t C0 = C0Node->getSExtValue();
19629 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19630 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19631 return true;
19632 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19633 return false;
19634
19635 // Default to true and let the DAGCombiner decide.
19636 return true;
19637}
19638
19639 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19640 bool isSEXTLoad, SDValue &Base,
19641 SDValue &Offset, bool &isInc,
19642 SelectionDAG &DAG) {
19643 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19644 return false;
19645
19646 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19647 // AddressingMode 3
19648 Base = Ptr->getOperand(0);
19649 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19650 int RHSC = (int)RHS->getZExtValue();
19651 if (RHSC < 0 && RHSC > -256) {
19652 assert(Ptr->getOpcode() == ISD::ADD);
19653 isInc = false;
19654 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19655 return true;
19656 }
19657 }
19658 isInc = (Ptr->getOpcode() == ISD::ADD);
19659 Offset = Ptr->getOperand(1);
19660 return true;
19661 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19662 // AddressingMode 2
19663 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19664 int RHSC = (int)RHS->getZExtValue();
19665 if (RHSC < 0 && RHSC > -0x1000) {
19666 assert(Ptr->getOpcode() == ISD::ADD);
19667 isInc = false;
19668 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19669 Base = Ptr->getOperand(0);
19670 return true;
19671 }
19672 }
19673
19674 if (Ptr->getOpcode() == ISD::ADD) {
19675 isInc = true;
19676 ARM_AM::ShiftOpc ShOpcVal=
19677 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19678 if (ShOpcVal != ARM_AM::no_shift) {
19679 Base = Ptr->getOperand(1);
19680 Offset = Ptr->getOperand(0);
19681 } else {
19682 Base = Ptr->getOperand(0);
19683 Offset = Ptr->getOperand(1);
19684 }
19685 return true;
19686 }
19687
19688 isInc = (Ptr->getOpcode() == ISD::ADD);
19689 Base = Ptr->getOperand(0);
19690 Offset = Ptr->getOperand(1);
19691 return true;
19692 }
19693
19694 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19695 return false;
19696}
19697
19698 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19699 bool isSEXTLoad, SDValue &Base,
19700 SDValue &Offset, bool &isInc,
19701 SelectionDAG &DAG) {
19702 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19703 return false;
19704
19705 Base = Ptr->getOperand(0);
19706 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19707 int RHSC = (int)RHS->getZExtValue();
19708 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19709 assert(Ptr->getOpcode() == ISD::ADD);
19710 isInc = false;
19711 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19712 return true;
19713 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19714 isInc = Ptr->getOpcode() == ISD::ADD;
19715 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19716 return true;
19717 }
19718 }
19719
19720 return false;
19721}
19722
19723static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19724 bool isSEXTLoad, bool IsMasked, bool isLE,
19725 SDValue &Base, SDValue &Offset,
19726 bool &isInc, SelectionDAG &DAG) {
19727 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19728 return false;
19729 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19730 return false;
19731
19732 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19733 // as opposed to a vldrw.32). This can allow extra addressing modes or
19734 // alignments for what is otherwise an equivalent instruction.
19735 bool CanChangeType = isLE && !IsMasked;
19736
19737 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19738 int RHSC = (int)RHS->getZExtValue();
19739
19740 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19741 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19742 assert(Ptr->getOpcode() == ISD::ADD);
19743 isInc = false;
19744 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19745 return true;
19746 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19747 isInc = Ptr->getOpcode() == ISD::ADD;
19748 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19749 return true;
19750 }
19751 return false;
19752 };
19753
19754 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19755 // (in BE/masked) type.
19756 Base = Ptr->getOperand(0);
19757 if (VT == MVT::v4i16) {
19758 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19759 return true;
19760 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19761 if (IsInRange(RHSC, 0x80, 1))
19762 return true;
19763 } else if (Alignment >= 4 &&
19764 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19765 IsInRange(RHSC, 0x80, 4))
19766 return true;
19767 else if (Alignment >= 2 &&
19768 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19769 IsInRange(RHSC, 0x80, 2))
19770 return true;
19771 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19772 return true;
19773 return false;
19774}
19775
19776/// getPreIndexedAddressParts - returns true by value, base pointer and
19777/// offset pointer and addressing mode by reference if the node's address
19778/// can be legally represented as pre-indexed load / store address.
19779bool
19780 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19781 SDValue &Offset,
19782 ISD::MemIndexedMode &AM,
19783 SelectionDAG &DAG) const {
19784 if (Subtarget->isThumb1Only())
19785 return false;
19786
19787 EVT VT;
19788 SDValue Ptr;
19789 Align Alignment;
19790 bool isSEXTLoad = false;
19791 bool IsMasked = false;
19792 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19793 Ptr = LD->getBasePtr();
19794 VT = LD->getMemoryVT();
19795 Alignment = LD->getAlign();
19796 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19797 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19798 Ptr = ST->getBasePtr();
19799 VT = ST->getMemoryVT();
19800 Alignment = ST->getAlign();
19801 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19802 Ptr = LD->getBasePtr();
19803 VT = LD->getMemoryVT();
19804 Alignment = LD->getAlign();
19805 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19806 IsMasked = true;
19807 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19808 Ptr = ST->getBasePtr();
19809 VT = ST->getMemoryVT();
19810 Alignment = ST->getAlign();
19811 IsMasked = true;
19812 } else
19813 return false;
19814
19815 bool isInc;
19816 bool isLegal = false;
19817 if (VT.isVector())
19818 isLegal = Subtarget->hasMVEIntegerOps() &&
19819 getMVEIndexedAddressParts(
19820 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19821 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19822 else {
19823 if (Subtarget->isThumb2())
19824 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19825 Offset, isInc, DAG);
19826 else
19827 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19828 Offset, isInc, DAG);
19829 }
19830 if (!isLegal)
19831 return false;
19832
19833 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19834 return true;
19835}
19836
19837/// getPostIndexedAddressParts - returns true by value, base pointer and
19838/// offset pointer and addressing mode by reference if this node can be
19839/// combined with a load / store to form a post-indexed load / store.
19840 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19841 SDValue &Base,
19842 SDValue &Offset,
19843 ISD::MemIndexedMode &AM,
19844 SelectionDAG &DAG) const {
19845 EVT VT;
19846 SDValue Ptr;
19847 Align Alignment;
19848 bool isSEXTLoad = false, isNonExt;
19849 bool IsMasked = false;
19850 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19851 VT = LD->getMemoryVT();
19852 Ptr = LD->getBasePtr();
19853 Alignment = LD->getAlign();
19854 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19855 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19856 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19857 VT = ST->getMemoryVT();
19858 Ptr = ST->getBasePtr();
19859 Alignment = ST->getAlign();
19860 isNonExt = !ST->isTruncatingStore();
19861 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19862 VT = LD->getMemoryVT();
19863 Ptr = LD->getBasePtr();
19864 Alignment = LD->getAlign();
19865 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19866 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19867 IsMasked = true;
19868 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19869 VT = ST->getMemoryVT();
19870 Ptr = ST->getBasePtr();
19871 Alignment = ST->getAlign();
19872 isNonExt = !ST->isTruncatingStore();
19873 IsMasked = true;
19874 } else
19875 return false;
19876
19877 if (Subtarget->isThumb1Only()) {
19878 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19879 // must be non-extending/truncating, i32, with an offset of 4.
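    // e.g. "ldr r1, [r0], #4" has no Thumb-1 encoding, but "ldm r0!, {r1}"
    // performs the same load with the same post-increment of the base.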
19880 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19881 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19882 return false;
19883 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19884 if (!RHS || RHS->getZExtValue() != 4)
19885 return false;
19886 if (Alignment < Align(4))
19887 return false;
19888
19889 Offset = Op->getOperand(1);
19890 Base = Op->getOperand(0);
19891 AM = ISD::POST_INC;
19892 return true;
19893 }
19894
19895 bool isInc;
19896 bool isLegal = false;
19897 if (VT.isVector())
19898 isLegal = Subtarget->hasMVEIntegerOps() &&
19899 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19900 Subtarget->isLittle(), Base, Offset,
19901 isInc, DAG);
19902 else {
19903 if (Subtarget->isThumb2())
19904 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19905 isInc, DAG);
19906 else
19907 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19908 isInc, DAG);
19909 }
19910 if (!isLegal)
19911 return false;
19912
19913 if (Ptr != Base) {
19914 // Swap base ptr and offset to catch more post-index load / store when
19915 // it's legal. In Thumb2 mode, offset must be an immediate.
19916 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19917 !Subtarget->isThumb2())
19918 std::swap(Base, Offset);
19919
19920 // Post-indexed load / store update the base pointer.
19921 if (Ptr != Base)
19922 return false;
19923 }
19924
19925 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19926 return true;
19927}
19928
19929 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19930 KnownBits &Known,
19931 const APInt &DemandedElts,
19932 const SelectionDAG &DAG,
19933 unsigned Depth) const {
19934 unsigned BitWidth = Known.getBitWidth();
19935 Known.resetAll();
19936 switch (Op.getOpcode()) {
19937 default: break;
19938 case ARMISD::ADDC:
19939 case ARMISD::ADDE:
19940 case ARMISD::SUBC:
19941 case ARMISD::SUBE:
19942 // Special cases when we convert a carry to a boolean.
19943 if (Op.getResNo() == 0) {
19944 SDValue LHS = Op.getOperand(0);
19945 SDValue RHS = Op.getOperand(1);
19946 // (ADDE 0, 0, C) will give us a single bit.
19947 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19948 isNullConstant(RHS)) {
19949 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19950 return;
19951 }
19952 }
19953 break;
19954 case ARMISD::CMOV: {
19955 // Bits are known zero/one if known on the LHS and RHS.
19956 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19957 if (Known.isUnknown())
19958 return;
19959
19960 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19961 Known = Known.intersectWith(KnownRHS);
19962 return;
19963 }
19964 case ISD::INTRINSIC_W_CHAIN: {
19965 Intrinsic::ID IntID =
19966 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19967 switch (IntID) {
19968 default: return;
19969 case Intrinsic::arm_ldaex:
19970 case Intrinsic::arm_ldrex: {
19971 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19972 unsigned MemBits = VT.getScalarSizeInBits();
19973 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19974 return;
19975 }
19976 }
19977 }
19978 case ARMISD::BFI: {
19979 // Conservatively, we can recurse down the first operand
19980 // and just mask out all affected bits.
19981 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19982
19983 // The operand to BFI is already a mask suitable for removing the bits it
19984 // sets.
19985 const APInt &Mask = Op.getConstantOperandAPInt(2);
19986 Known.Zero &= Mask;
19987 Known.One &= Mask;
19988 return;
19989 }
19990 case ARMISD::VGETLANEs:
19991 case ARMISD::VGETLANEu: {
19992 const SDValue &SrcSV = Op.getOperand(0);
19993 EVT VecVT = SrcSV.getValueType();
19994 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19995 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19996 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19997 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19998 "VGETLANE index out of bounds");
19999 unsigned Idx = Pos->getZExtValue();
20000 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20001 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20002
20003 EVT VT = Op.getValueType();
20004 const unsigned DstSz = VT.getScalarSizeInBits();
20005 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20006 (void)SrcSz;
20007 assert(SrcSz == Known.getBitWidth());
20008 assert(DstSz > SrcSz);
20009 if (Op.getOpcode() == ARMISD::VGETLANEs)
20010 Known = Known.sext(DstSz);
20011 else {
20012 Known = Known.zext(DstSz);
20013 }
20014 assert(DstSz == Known.getBitWidth());
20015 break;
20016 }
20017 case ARMISD::VMOVrh: {
20018 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20019 assert(KnownOp.getBitWidth() == 16);
20020 Known = KnownOp.zext(32);
20021 break;
20022 }
20023 case ARMISD::CSINC:
20024 case ARMISD::CSINV:
20025 case ARMISD::CSNEG: {
20026 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20027 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20028
20029 // The result is either:
20030 // CSINC: KnownOp0 or KnownOp1 + 1
20031 // CSINV: KnownOp0 or ~KnownOp1
20032 // CSNEG: KnownOp0 or KnownOp1 * -1
20033 if (Op.getOpcode() == ARMISD::CSINC)
20034 KnownOp1 =
20035 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20036 else if (Op.getOpcode() == ARMISD::CSINV)
20037 std::swap(KnownOp1.Zero, KnownOp1.One);
20038 else if (Op.getOpcode() == ARMISD::CSNEG)
20039 KnownOp1 = KnownBits::mul(KnownOp1,
20040 KnownBits::makeConstant(APInt(32, -1)));
20041
20042 Known = KnownOp0.intersectWith(KnownOp1);
20043 break;
20044 }
20045 case ARMISD::VORRIMM:
20046 case ARMISD::VBICIMM: {
20047 unsigned Encoded = Op.getConstantOperandVal(1);
20048 unsigned DecEltBits = 0;
20049 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20050
20051 unsigned EltBits = Op.getScalarValueSizeInBits();
20052 if (EltBits != DecEltBits) {
20053 // Be conservative: only update Known when EltBits == DecEltBits.
20054 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20055 // that changes in the future, doing nothing here is safer than risking
20056 // subtle bugs.
20057 break;
20058 }
20059
20060 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20061 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20062 APInt Imm(DecEltBits, DecodedVal);
20063
20064 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20065 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20066 break;
20067 }
20068 }
20069}
20070
20071 bool ARMTargetLowering::targetShrinkDemandedConstant(
20072 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20073 TargetLoweringOpt &TLO) const {
20074 // Delay optimization, so we don't have to deal with illegal types, or block
20075 // optimizations.
20076 if (!TLO.LegalOps)
20077 return false;
20078
20079 // Only optimize AND for now.
20080 if (Op.getOpcode() != ISD::AND)
20081 return false;
20082
20083 EVT VT = Op.getValueType();
20084
20085 // Ignore vectors.
20086 if (VT.isVector())
20087 return false;
20088
20089 assert(VT == MVT::i32 && "Unexpected integer type");
20090
20091 // Make sure the RHS really is a constant.
20092 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20093 if (!C)
20094 return false;
20095
20096 unsigned Mask = C->getZExtValue();
20097
20098 unsigned Demanded = DemandedBits.getZExtValue();
20099 unsigned ShrunkMask = Mask & Demanded;
20100 unsigned ExpandedMask = Mask | ~Demanded;
20101
20102 // If the mask is all zeros, let the target-independent code replace the
20103 // result with zero.
20104 if (ShrunkMask == 0)
20105 return false;
20106
20107 // If the mask is all ones, erase the AND. (Currently, the target-independent
20108 // code won't do this, so we have to do it explicitly to avoid an infinite
20109 // loop in obscure cases.)
20110 if (ExpandedMask == ~0U)
20111 return TLO.CombineTo(Op, Op.getOperand(0));
20112
20113 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20114 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20115 };
20116 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20117 if (NewMask == Mask)
20118 return true;
20119 SDLoc DL(Op);
20120 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20121 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20122 return TLO.CombineTo(Op, NewOp);
20123 };
20124
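  // Worked example: for (x & 0x10FF) with only the low 12 bits demanded,
  // ShrunkMask is 0xFF and ExpandedMask is 0xFFFFF0FF, so 0xFF is a legal
  // replacement mask and the AND can be selected as a single UXTB.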
20125 // Prefer uxtb mask.
20126 if (IsLegalMask(0xFF))
20127 return UseMask(0xFF);
20128
20129 // Prefer uxth mask.
20130 if (IsLegalMask(0xFFFF))
20131 return UseMask(0xFFFF);
20132
20133 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20134 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20135 if (ShrunkMask < 256)
20136 return UseMask(ShrunkMask);
20137
20138 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20139 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20140 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20141 return UseMask(ExpandedMask);
20142
20143 // Potential improvements:
20144 //
20145 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20146 // We could try to prefer Thumb1 immediates which can be lowered to a
20147 // two-instruction sequence.
20148 // We could try to recognize more legal ARM/Thumb2 immediates here.
20149
20150 return false;
20151}
20152
20153 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20154 SDValue Op, const APInt &OriginalDemandedBits,
20155 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20156 unsigned Depth) const {
20157 unsigned Opc = Op.getOpcode();
20158
20159 switch (Opc) {
20160 case ARMISD::ASRL:
20161 case ARMISD::LSRL: {
20162 // If this is result 0 and the other result is unused, see if the demand
20163 // bits allow us to shrink this long shift into a standard small shift in
20164 // the opposite direction.
20165 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20166 isa<ConstantSDNode>(Op->getOperand(2))) {
20167 unsigned ShAmt = Op->getConstantOperandVal(2);
20168 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20169 << (32 - ShAmt)))
20170 return TLO.CombineTo(
20171 Op, TLO.DAG.getNode(
20172 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20173 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20174 }
20175 break;
20176 }
20177 case ARMISD::VBICIMM: {
20178 SDValue Op0 = Op.getOperand(0);
20179 unsigned ModImm = Op.getConstantOperandVal(1);
20180 unsigned EltBits = 0;
20181 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20182 if ((OriginalDemandedBits & Mask) == 0)
20183 return TLO.CombineTo(Op, Op0);
20184 }
20185 }
20186
20187 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20188 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20189}
20190
20191//===----------------------------------------------------------------------===//
20192// ARM Inline Assembly Support
20193//===----------------------------------------------------------------------===//
20194
20195const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20196 // At this point, we have to lower this constraint to something else, so we
20197 // lower it to an "r" or "w". However, by doing this we will force the result
20198 // to be in register, while the X constraint is much more permissive.
20199 //
20200 // Although we are correct (we are free to emit anything, without
20201 // constraints), we might break use cases that would expect us to be more
20202 // efficient and emit something else.
20203 if (!Subtarget->hasVFP2Base())
20204 return "r";
20205 if (ConstraintVT.isFloatingPoint())
20206 return "w";
20207 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20208 (ConstraintVT.getSizeInBits() == 64 ||
20209 ConstraintVT.getSizeInBits() == 128))
20210 return "w";
20211
20212 return "r";
20213}
20214
20215/// getConstraintType - Given a constraint letter, return the type of
20216/// constraint it is for this target.
20217 ARMTargetLowering::ConstraintType
20218 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20219 unsigned S = Constraint.size();
20220 if (S == 1) {
20221 switch (Constraint[0]) {
20222 default: break;
20223 case 'l': return C_RegisterClass;
20224 case 'w': return C_RegisterClass;
20225 case 'h': return C_RegisterClass;
20226 case 'x': return C_RegisterClass;
20227 case 't': return C_RegisterClass;
20228 case 'j': return C_Immediate; // Constant for movw.
20229 // An address with a single base register. Due to the way we
20230 // currently handle addresses it is the same as an 'r' memory constraint.
20231 case 'Q': return C_Memory;
20232 }
20233 } else if (S == 2) {
20234 switch (Constraint[0]) {
20235 default: break;
20236 case 'T': return C_RegisterClass;
20237 // All 'U+' constraints are addresses.
20238 case 'U': return C_Memory;
20239 }
20240 }
20241 return TargetLowering::getConstraintType(Constraint);
20242}
20243
20244/// Examine constraint type and operand type and determine a weight value.
20245/// This object must already have been set up with the operand type
20246/// and the current alternative constraint selected.
20247 TargetLowering::ConstraintWeight
20248 ARMTargetLowering::getSingleConstraintMatchWeight(
20249 AsmOperandInfo &info, const char *constraint) const {
20250 ConstraintWeight weight = CW_Invalid;
20251 Value *CallOperandVal = info.CallOperandVal;
20252 // If we don't have a value, we can't do a match,
20253 // but allow it at the lowest weight.
20254 if (!CallOperandVal)
20255 return CW_Default;
20256 Type *type = CallOperandVal->getType();
20257 // Look at the constraint type.
20258 switch (*constraint) {
20259 default:
20260 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20261 break;
20262 case 'l':
20263 if (type->isIntegerTy()) {
20264 if (Subtarget->isThumb())
20265 weight = CW_SpecificReg;
20266 else
20267 weight = CW_Register;
20268 }
20269 break;
20270 case 'w':
20271 if (type->isFloatingPointTy())
20272 weight = CW_Register;
20273 break;
20274 }
20275 return weight;
20276}
20277
20278static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20279 if (PR == 0 || VT == MVT::Other)
20280 return false;
20281 if (ARM::SPRRegClass.contains(PR))
20282 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20283 if (ARM::DPRRegClass.contains(PR))
20284 return VT != MVT::f64 && !VT.is64BitVector();
20285 return false;
20286}
20287
20288using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20289
20290 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20291 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20292 switch (Constraint.size()) {
20293 case 1:
20294 // GCC ARM Constraint Letters
20295 switch (Constraint[0]) {
20296 case 'l': // Low regs or general regs.
20297 if (Subtarget->isThumb())
20298 return RCPair(0U, &ARM::tGPRRegClass);
20299 return RCPair(0U, &ARM::GPRRegClass);
20300 case 'h': // High regs or no regs.
20301 if (Subtarget->isThumb())
20302 return RCPair(0U, &ARM::hGPRRegClass);
20303 break;
20304 case 'r':
20305 if (Subtarget->isThumb1Only())
20306 return RCPair(0U, &ARM::tGPRRegClass);
20307 return RCPair(0U, &ARM::GPRRegClass);
20308 case 'w':
20309 if (VT == MVT::Other)
20310 break;
20311 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20312 return RCPair(0U, &ARM::SPRRegClass);
20313 if (VT.getSizeInBits() == 64)
20314 return RCPair(0U, &ARM::DPRRegClass);
20315 if (VT.getSizeInBits() == 128)
20316 return RCPair(0U, &ARM::QPRRegClass);
20317 break;
20318 case 'x':
20319 if (VT == MVT::Other)
20320 break;
20321 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20322 return RCPair(0U, &ARM::SPR_8RegClass);
20323 if (VT.getSizeInBits() == 64)
20324 return RCPair(0U, &ARM::DPR_8RegClass);
20325 if (VT.getSizeInBits() == 128)
20326 return RCPair(0U, &ARM::QPR_8RegClass);
20327 break;
20328 case 't':
20329 if (VT == MVT::Other)
20330 break;
20331 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20332 return RCPair(0U, &ARM::SPRRegClass);
20333 if (VT.getSizeInBits() == 64)
20334 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20335 if (VT.getSizeInBits() == 128)
20336 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20337 break;
20338 }
20339 break;
20340
20341 case 2:
20342 if (Constraint[0] == 'T') {
20343 switch (Constraint[1]) {
20344 default:
20345 break;
20346 case 'e':
20347 return RCPair(0U, &ARM::tGPREvenRegClass);
20348 case 'o':
20349 return RCPair(0U, &ARM::tGPROddRegClass);
20350 }
20351 }
20352 break;
20353
20354 default:
20355 break;
20356 }
20357
20358 if (StringRef("{cc}").equals_insensitive(Constraint))
20359 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20360
20361 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20362 if (isIncompatibleReg(RCP.first, VT))
20363 return {0, nullptr};
20364 return RCP;
20365}
20366
20367/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20368/// vector. If it is invalid, don't add anything to Ops.
20369 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20370 StringRef Constraint,
20371 std::vector<SDValue> &Ops,
20372 SelectionDAG &DAG) const {
20373 SDValue Result;
20374
20375 // Currently only support length 1 constraints.
20376 if (Constraint.size() != 1)
20377 return;
20378
20379 char ConstraintLetter = Constraint[0];
20380 switch (ConstraintLetter) {
20381 default: break;
20382 case 'j':
20383 case 'I': case 'J': case 'K': case 'L':
20384 case 'M': case 'N': case 'O':
20385 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20386 if (!C)
20387 return;
20388
20389 int64_t CVal64 = C->getSExtValue();
20390 int CVal = (int) CVal64;
20391 // None of these constraints allow values larger than 32 bits. Check
20392 // that the value fits in an int.
20393 if (CVal != CVal64)
20394 return;
20395
20396 switch (ConstraintLetter) {
20397 case 'j':
20398 // Constant suitable for movw, must be between 0 and
20399 // 65535.
20400 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20401 if (CVal >= 0 && CVal <= 65535)
20402 break;
20403 return;
20404 case 'I':
20405 if (Subtarget->isThumb1Only()) {
20406 // This must be a constant between 0 and 255, for ADD
20407 // immediates.
20408 if (CVal >= 0 && CVal <= 255)
20409 break;
20410 } else if (Subtarget->isThumb2()) {
20411 // A constant that can be used as an immediate value in a
20412 // data-processing instruction.
20413 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20414 break;
20415 } else {
20416 // A constant that can be used as an immediate value in a
20417 // data-processing instruction.
20418 if (ARM_AM::getSOImmVal(CVal) != -1)
20419 break;
20420 }
20421 return;
20422
20423 case 'J':
20424 if (Subtarget->isThumb1Only()) {
20425 // This must be a constant between -255 and -1, for negated ADD
20426 // immediates. This can be used in GCC with an "n" modifier that
20427 // prints the negated value, for use with SUB instructions. It is
20428 // not useful otherwise but is implemented for compatibility.
20429 if (CVal >= -255 && CVal <= -1)
20430 break;
20431 } else {
20432 // This must be a constant between -4095 and 4095. This is suitable
20433 // for use as the immediate offset field in LDR and STR instructions
20434 // such as LDR r0,[r1,#offset].
20435 if (CVal >= -4095 && CVal <= 4095)
20436 break;
20437 }
20438 return;
20439
20440 case 'K':
20441 if (Subtarget->isThumb1Only()) {
20442 // A 32-bit value where only one byte has a nonzero value. Exclude
20443 // zero to match GCC. This constraint is used by GCC internally for
20444 // constants that can be loaded with a move/shift combination.
20445 // It is not useful otherwise but is implemented for compatibility.
20446 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20447 break;
20448 } else if (Subtarget->isThumb2()) {
20449 // A constant whose bitwise inverse can be used as an immediate
20450 // value in a data-processing instruction. This can be used in GCC
20451 // with a "B" modifier that prints the inverted value, for use with
20452 // BIC and MVN instructions. It is not useful otherwise but is
20453 // implemented for compatibility.
20454 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20455 break;
20456 } else {
20457 // A constant whose bitwise inverse can be used as an immediate
20458 // value in a data-processing instruction. This can be used in GCC
20459 // with a "B" modifier that prints the inverted value, for use with
20460 // BIC and MVN instructions. It is not useful otherwise but is
20461 // implemented for compatibility.
20462 if (ARM_AM::getSOImmVal(~CVal) != -1)
20463 break;
20464 }
20465 return;
20466
20467 case 'L':
20468 if (Subtarget->isThumb1Only()) {
20469 // This must be a constant between -7 and 7,
20470 // for 3-operand ADD/SUB immediate instructions.
20471 if (CVal >= -7 && CVal < 7)
20472 break;
20473 } else if (Subtarget->isThumb2()) {
20474 // A constant whose negation can be used as an immediate value in a
20475 // data-processing instruction. This can be used in GCC with an "n"
20476 // modifier that prints the negated value, for use with SUB
20477 // instructions. It is not useful otherwise but is implemented for
20478 // compatibility.
20479 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20480 break;
20481 } else {
20482 // A constant whose negation can be used as an immediate value in a
20483 // data-processing instruction. This can be used in GCC with an "n"
20484 // modifier that prints the negated value, for use with SUB
20485 // instructions. It is not useful otherwise but is implemented for
20486 // compatibility.
20487 if (ARM_AM::getSOImmVal(-CVal) != -1)
20488 break;
20489 }
20490 return;
20491
20492 case 'M':
20493 if (Subtarget->isThumb1Only()) {
20494 // This must be a multiple of 4 between 0 and 1020, for
20495 // ADD sp + immediate.
20496 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20497 break;
20498 } else {
20499 // A power of two or a constant between 0 and 32. This is used in
20500 // GCC for the shift amount on shifted register operands, but it is
20501 // useful in general for any shift amounts.
20502 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20503 break;
20504 }
20505 return;
20506
20507 case 'N':
20508 if (Subtarget->isThumb1Only()) {
20509 // This must be a constant between 0 and 31, for shift amounts.
20510 if (CVal >= 0 && CVal <= 31)
20511 break;
20512 }
20513 return;
20514
20515 case 'O':
20516 if (Subtarget->isThumb1Only()) {
20517 // This must be a multiple of 4 between -508 and 508, for
20518 // ADD/SUB sp = sp + immediate.
20519 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20520 break;
20521 }
20522 return;
20523 }
20524 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20525 break;
20526 }
20527
20528 if (Result.getNode()) {
20529 Ops.push_back(Result);
20530 return;
20531 }
20532 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20533}
20534
20535static RTLIB::Libcall getDivRemLibcall(
20536 const SDNode *N, MVT::SimpleValueType SVT) {
20537 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20538 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20539 "Unhandled Opcode in getDivRemLibcall");
20540 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20541 N->getOpcode() == ISD::SREM;
20542 RTLIB::Libcall LC;
20543 switch (SVT) {
20544 default: llvm_unreachable("Unexpected request for libcall!");
20545 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20546 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20547 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20548 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20549 }
20550 return LC;
20551}
20552
20553 static TargetLowering::ArgListTy getDivRemArgList(
20554 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20555 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20556 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20557 "Unhandled Opcode in getDivRemArgList");
20558 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20559 N->getOpcode() == ISD::SREM;
20560 TargetLowering::ArgListTy Args;
20561 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20562 EVT ArgVT = N->getOperand(i).getValueType();
20563 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20564 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20565 Entry.IsSExt = isSigned;
20566 Entry.IsZExt = !isSigned;
20567 Args.push_back(Entry);
20568 }
20569 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20570 std::swap(Args[0], Args[1]);
20571 return Args;
20572}
20573
20574SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20575 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20576 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20577 Subtarget->isTargetWindows()) &&
20578 "Register-based DivRem lowering only");
20579 unsigned Opcode = Op->getOpcode();
20580 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20581 "Invalid opcode for Div/Rem lowering");
20582 bool isSigned = (Opcode == ISD::SDIVREM);
20583 EVT VT = Op->getValueType(0);
20584 SDLoc dl(Op);
20585
20586 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20587 SmallVector<SDValue> Result;
20588 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20589 SDValue Res0 =
20590 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20591 SDValue Res1 =
20592 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20593 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20594 {Res0, Res1});
20595 }
20596 }
20597
20598 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20599
20600 // If the target has hardware divide, use divide + multiply + subtract:
20601 // div = a / b
20602 // rem = a - b * div
20603 // return {div, rem}
20604 // This should be lowered into UDIV/SDIV + MLS later on.
20605 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20606 : Subtarget->hasDivideInARMMode();
20607 if (hasDivide && Op->getValueType(0).isSimple() &&
20608 Op->getSimpleValueType(0) == MVT::i32) {
20609 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20610 const SDValue Dividend = Op->getOperand(0);
20611 const SDValue Divisor = Op->getOperand(1);
20612 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20613 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20614 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20615
20616 SDValue Values[2] = {Div, Rem};
20617 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20618 }
20619
20620 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20621 VT.getSimpleVT().SimpleTy);
20622 SDValue InChain = DAG.getEntryNode();
20623
20624  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20625                                                    DAG.getContext(),
20626 Subtarget);
20627
20628  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20629                                         getPointerTy(DAG.getDataLayout()));
20630
20631 Type *RetTy = StructType::get(Ty, Ty);
20632
20633 if (Subtarget->isTargetWindows())
20634 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20635
20636 TargetLowering::CallLoweringInfo CLI(DAG);
20637 CLI.setDebugLoc(dl).setChain(InChain)
20638 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20639      .setSExtResult(isSigned).setZExtResult(!isSigned);
20640
20641 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20642 return CallInfo.first;
20643}
20644
20645// Lowers REM using divmod helpers
20646// see RTABI section 4.2/4.3
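// For example, an i32 srem becomes a call to __aeabi_idivmod, which returns
// {quotient, remainder} in {r0, r1}; only the remainder half of that result
// is used here.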
20647SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20648 EVT VT = N->getValueType(0);
20649
20650 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20651    SmallVector<SDValue> Result;
20652    if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20653 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20654 Result[0], Result[1]);
20655 }
20656
20657 // Build return types (div and rem)
20658 std::vector<Type*> RetTyParams;
20659 Type *RetTyElement;
20660
20661 switch (VT.getSimpleVT().SimpleTy) {
20662 default: llvm_unreachable("Unexpected request for libcall!");
20663 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20664 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20665 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20666 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20667 }
20668
20669 RetTyParams.push_back(RetTyElement);
20670 RetTyParams.push_back(RetTyElement);
20671 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20672 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20673
20674 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20675 SimpleTy);
20676 SDValue InChain = DAG.getEntryNode();
20677  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20678                                                    Subtarget);
20679 bool isSigned = N->getOpcode() == ISD::SREM;
20680  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20681                                         getPointerTy(DAG.getDataLayout()));
20682
20683 if (Subtarget->isTargetWindows())
20684 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20685
20686 // Lower call
20687 CallLoweringInfo CLI(DAG);
20688 CLI.setChain(InChain)
20689 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20690      .setSExtResult(isSigned).setZExtResult(!isSigned);
20691  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20692
20693 // Return second (rem) result operand (first contains div)
20694 SDNode *ResNode = CallResult.first.getNode();
20695 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20696 return ResNode->getOperand(1);
20697}
20698
20699SDValue
20700ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20701 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20702 SDLoc DL(Op);
20703
20704 // Get the inputs.
20705 SDValue Chain = Op.getOperand(0);
20706 SDValue Size = Op.getOperand(1);
20707
20708  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
20709          "no-stack-arg-probe")) {
20710 MaybeAlign Align =
20711 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20712 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20713 Chain = SP.getValue(1);
20714 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20715 if (Align)
20716 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20717 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20718 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20719 SDValue Ops[2] = { SP, Chain };
20720 return DAG.getMergeValues(Ops, DL);
20721 }
20722
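  // Note: the Windows stack-probe protocol passes the allocation size in
  // 4-byte words in r4, which is why Size is shifted right by two and copied
  // into R4 before the ARMISD::WIN__CHKSTK node; SP is re-read afterwards to
  // pick up the adjusted stack pointer.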
20723 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20724 DAG.getConstant(2, DL, MVT::i32));
20725
20726 SDValue Glue;
20727 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20728 Glue = Chain.getValue(1);
20729
20730 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20731 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20732
20733 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20734 Chain = NewSP.getValue(1);
20735
20736 SDValue Ops[2] = { NewSP, Chain };
20737 return DAG.getMergeValues(Ops, DL);
20738}
20739
20740SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20741 bool IsStrict = Op->isStrictFPOpcode();
20742 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20743 const unsigned DstSz = Op.getValueType().getSizeInBits();
20744 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20745 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20746 "Unexpected type for custom-lowering FP_EXTEND");
20747
20748 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20749 "With both FP DP and 16, any FP conversion is legal!");
20750
20751 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20752 "With FP16, 16 to 32 conversion is legal!");
20753
20754 // Converting from 32 -> 64 is valid if we have FP64.
20755 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20756 // FIXME: Remove this when we have strict fp instruction selection patterns
20757 if (IsStrict) {
20758 SDLoc Loc(Op);
20759 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20760 Loc, Op.getValueType(), SrcVal);
20761 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20762 }
20763 return Op;
20764 }
20765
20766  // Otherwise, either we are converting from 16 -> 64 without FP16 and/or
20767  // double-precision FP (or without Armv8 FP), so the conversion must be
20768  // done in two steps,
20769  // or we are converting from 32 -> 64 without double-precision FP, or from
20770  // 16 -> 32 without FP16, so the conversion must be done as a libcall.
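  // As a rough example: extending f16 to f64 with neither FP16 nor FP64
  // support becomes two libcalls (f16 -> f32, then f32 -> f64), while a
  // target with only FP16 would use a single VCVT for the first step and a
  // libcall for the second.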
20771 SDLoc Loc(Op);
20772 RTLIB::Libcall LC;
20773 MakeLibCallOptions CallOptions;
20774 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20775 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20776 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20777 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20778 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20779 if (Supported) {
20780 if (IsStrict) {
20781 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20782 {DstVT, MVT::Other}, {Chain, SrcVal});
20783 Chain = SrcVal.getValue(1);
20784 } else {
20785 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20786 }
20787 } else {
20788 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20789 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20790 "Unexpected type for custom-lowering FP_EXTEND");
20791 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20792 Loc, Chain);
20793 }
20794 }
20795
20796 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20797}
20798
20799SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20800 bool IsStrict = Op->isStrictFPOpcode();
20801
20802 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20803 EVT SrcVT = SrcVal.getValueType();
20804 EVT DstVT = Op.getValueType();
20805 const unsigned DstSz = Op.getValueType().getSizeInBits();
20806 const unsigned SrcSz = SrcVT.getSizeInBits();
20807 (void)DstSz;
20808 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20809 "Unexpected type for custom-lowering FP_ROUND");
20810
20811 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20812 "With both FP DP and 16, any FP conversion is legal!");
20813
20814 SDLoc Loc(Op);
20815
20816  // A 32 -> 16 round is a single instruction if we have FP16.
20817 if (SrcSz == 32 && Subtarget->hasFP16())
20818 return Op;
20819
20820 // Lib call from 32 -> 16 / 64 -> [32, 16]
20821 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20822 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20823 "Unexpected type for custom-lowering FP_ROUND");
20824 MakeLibCallOptions CallOptions;
20825 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20826  SDValue Result;
20827  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20828 Loc, Chain);
20829 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20830}
20831
20832bool
20833ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20834  // The ARM target isn't yet aware of offsets.
20835 return false;
20836}
20837
20838bool ARM::isBitFieldInvertedMask(unsigned v) {
20839  if (v == 0xffffffff)
20840 return false;
20841
20842 // there can be 1's on either or both "outsides", all the "inside"
20843 // bits must be 0's
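  // For example, 0xff0000ff is an inverted bitfield mask (set bits only on
  // the outside, a contiguous run of zeros inside), while 0x00ffff00 is not,
  // because its complement is not a single shifted mask.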
20844 return isShiftedMask_32(~v);
20845}
20846
20847/// isFPImmLegal - Returns true if the target can instruction select the
20848/// specified FP immediate natively. If false, the legalizer will
20849/// materialize the FP immediate as a load from a constant pool.
20850bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20851                                     bool ForCodeSize) const {
20852 if (!Subtarget->hasVFP3Base())
20853 return false;
20854 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20855 return ARM_AM::getFP16Imm(Imm) != -1;
20856 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20857 ARM_AM::getFP32FP16Imm(Imm) != -1)
20858 return true;
20859 if (VT == MVT::f32)
20860 return ARM_AM::getFP32Imm(Imm) != -1;
20861 if (VT == MVT::f64 && Subtarget->hasFP64())
20862 return ARM_AM::getFP64Imm(Imm) != -1;
20863 return false;
20864}
20865
20866/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20867/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20868/// specified in the intrinsic calls.
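/// For example, a call to llvm.arm.neon.vld2.v4i32 returning
/// {<4 x i32>, <4 x i32>} is conservatively described as a single 256-bit
/// load (memVT = v4i64), with the alignment taken from the intrinsic's
/// explicit alignment operand.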
20869bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20870                                           const CallInst &I,
20871 MachineFunction &MF,
20872 unsigned Intrinsic) const {
20873 switch (Intrinsic) {
20874 case Intrinsic::arm_neon_vld1:
20875 case Intrinsic::arm_neon_vld2:
20876 case Intrinsic::arm_neon_vld3:
20877 case Intrinsic::arm_neon_vld4:
20878 case Intrinsic::arm_neon_vld2lane:
20879 case Intrinsic::arm_neon_vld3lane:
20880 case Intrinsic::arm_neon_vld4lane:
20881 case Intrinsic::arm_neon_vld2dup:
20882 case Intrinsic::arm_neon_vld3dup:
20883 case Intrinsic::arm_neon_vld4dup: {
20884 Info.opc = ISD::INTRINSIC_W_CHAIN;
20885 // Conservatively set memVT to the entire set of vectors loaded.
20886 auto &DL = I.getDataLayout();
20887 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20888 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20889 Info.ptrVal = I.getArgOperand(0);
20890 Info.offset = 0;
20891 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20892 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20893 // volatile loads with NEON intrinsics not supported
20894 Info.flags = MachineMemOperand::MOLoad;
20895 return true;
20896 }
20897 case Intrinsic::arm_neon_vld1x2:
20898 case Intrinsic::arm_neon_vld1x3:
20899 case Intrinsic::arm_neon_vld1x4: {
20900 Info.opc = ISD::INTRINSIC_W_CHAIN;
20901 // Conservatively set memVT to the entire set of vectors loaded.
20902 auto &DL = I.getDataLayout();
20903 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20904 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20905 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20906 Info.offset = 0;
20907 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20908 // volatile loads with NEON intrinsics not supported
20909 Info.flags = MachineMemOperand::MOLoad;
20910 return true;
20911 }
20912 case Intrinsic::arm_neon_vst1:
20913 case Intrinsic::arm_neon_vst2:
20914 case Intrinsic::arm_neon_vst3:
20915 case Intrinsic::arm_neon_vst4:
20916 case Intrinsic::arm_neon_vst2lane:
20917 case Intrinsic::arm_neon_vst3lane:
20918 case Intrinsic::arm_neon_vst4lane: {
20919 Info.opc = ISD::INTRINSIC_VOID;
20920 // Conservatively set memVT to the entire set of vectors stored.
20921 auto &DL = I.getDataLayout();
20922 unsigned NumElts = 0;
20923 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20924 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20925 if (!ArgTy->isVectorTy())
20926 break;
20927 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20928 }
20929 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20930 Info.ptrVal = I.getArgOperand(0);
20931 Info.offset = 0;
20932 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20933 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20934 // volatile stores with NEON intrinsics not supported
20935 Info.flags = MachineMemOperand::MOStore;
20936 return true;
20937 }
20938 case Intrinsic::arm_neon_vst1x2:
20939 case Intrinsic::arm_neon_vst1x3:
20940 case Intrinsic::arm_neon_vst1x4: {
20941 Info.opc = ISD::INTRINSIC_VOID;
20942 // Conservatively set memVT to the entire set of vectors stored.
20943 auto &DL = I.getDataLayout();
20944 unsigned NumElts = 0;
20945 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20946 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20947 if (!ArgTy->isVectorTy())
20948 break;
20949 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20950 }
20951 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20952 Info.ptrVal = I.getArgOperand(0);
20953 Info.offset = 0;
20954 Info.align = I.getParamAlign(0).valueOrOne();
20955 // volatile stores with NEON intrinsics not supported
20956 Info.flags = MachineMemOperand::MOStore;
20957 return true;
20958 }
20959 case Intrinsic::arm_mve_vld2q:
20960 case Intrinsic::arm_mve_vld4q: {
20961 Info.opc = ISD::INTRINSIC_W_CHAIN;
20962 // Conservatively set memVT to the entire set of vectors loaded.
20963 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20964 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20965 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20966 Info.ptrVal = I.getArgOperand(0);
20967 Info.offset = 0;
20968 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20969 // volatile loads with MVE intrinsics not supported
20970 Info.flags = MachineMemOperand::MOLoad;
20971 return true;
20972 }
20973 case Intrinsic::arm_mve_vst2q:
20974 case Intrinsic::arm_mve_vst4q: {
20975 Info.opc = ISD::INTRINSIC_VOID;
20976 // Conservatively set memVT to the entire set of vectors stored.
20977 Type *VecTy = I.getArgOperand(1)->getType();
20978 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20979 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20980 Info.ptrVal = I.getArgOperand(0);
20981 Info.offset = 0;
20982 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20983 // volatile stores with MVE intrinsics not supported
20984 Info.flags = MachineMemOperand::MOStore;
20985 return true;
20986 }
20987 case Intrinsic::arm_mve_vldr_gather_base:
20988 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20989 Info.opc = ISD::INTRINSIC_W_CHAIN;
20990 Info.ptrVal = nullptr;
20991 Info.memVT = MVT::getVT(I.getType());
20992 Info.align = Align(1);
20993 Info.flags |= MachineMemOperand::MOLoad;
20994 return true;
20995 }
20996 case Intrinsic::arm_mve_vldr_gather_base_wb:
20997 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20998 Info.opc = ISD::INTRINSIC_W_CHAIN;
20999 Info.ptrVal = nullptr;
21000 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21001 Info.align = Align(1);
21002 Info.flags |= MachineMemOperand::MOLoad;
21003 return true;
21004 }
21005 case Intrinsic::arm_mve_vldr_gather_offset:
21006 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21007 Info.opc = ISD::INTRINSIC_W_CHAIN;
21008 Info.ptrVal = nullptr;
21009 MVT DataVT = MVT::getVT(I.getType());
21010 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21011 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21012 DataVT.getVectorNumElements());
21013 Info.align = Align(1);
21014 Info.flags |= MachineMemOperand::MOLoad;
21015 return true;
21016 }
21017 case Intrinsic::arm_mve_vstr_scatter_base:
21018 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21019 Info.opc = ISD::INTRINSIC_VOID;
21020 Info.ptrVal = nullptr;
21021 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21022 Info.align = Align(1);
21023 Info.flags |= MachineMemOperand::MOStore;
21024 return true;
21025 }
21026 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21027 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21028 Info.opc = ISD::INTRINSIC_W_CHAIN;
21029 Info.ptrVal = nullptr;
21030 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21031 Info.align = Align(1);
21032 Info.flags |= MachineMemOperand::MOStore;
21033 return true;
21034 }
21035 case Intrinsic::arm_mve_vstr_scatter_offset:
21036 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21037 Info.opc = ISD::INTRINSIC_VOID;
21038 Info.ptrVal = nullptr;
21039 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21040 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21041 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21042 DataVT.getVectorNumElements());
21043 Info.align = Align(1);
21044 Info.flags |= MachineMemOperand::MOStore;
21045 return true;
21046 }
21047 case Intrinsic::arm_ldaex:
21048 case Intrinsic::arm_ldrex: {
21049 auto &DL = I.getDataLayout();
21050 Type *ValTy = I.getParamElementType(0);
21051 Info.opc = ISD::INTRINSIC_W_CHAIN;
21052 Info.memVT = MVT::getVT(ValTy);
21053 Info.ptrVal = I.getArgOperand(0);
21054 Info.offset = 0;
21055 Info.align = DL.getABITypeAlign(ValTy);
21056    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21057    return true;
21058 }
21059 case Intrinsic::arm_stlex:
21060 case Intrinsic::arm_strex: {
21061 auto &DL = I.getDataLayout();
21062 Type *ValTy = I.getParamElementType(1);
21063 Info.opc = ISD::INTRINSIC_W_CHAIN;
21064 Info.memVT = MVT::getVT(ValTy);
21065 Info.ptrVal = I.getArgOperand(1);
21066 Info.offset = 0;
21067 Info.align = DL.getABITypeAlign(ValTy);
21068    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21069    return true;
21070 }
21071 case Intrinsic::arm_stlexd:
21072 case Intrinsic::arm_strexd:
21073 Info.opc = ISD::INTRINSIC_W_CHAIN;
21074 Info.memVT = MVT::i64;
21075 Info.ptrVal = I.getArgOperand(2);
21076 Info.offset = 0;
21077 Info.align = Align(8);
21078    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21079    return true;
21080
21081 case Intrinsic::arm_ldaexd:
21082 case Intrinsic::arm_ldrexd:
21083 Info.opc = ISD::INTRINSIC_W_CHAIN;
21084 Info.memVT = MVT::i64;
21085 Info.ptrVal = I.getArgOperand(0);
21086 Info.offset = 0;
21087 Info.align = Align(8);
21088    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21089    return true;
21090
21091 default:
21092 break;
21093 }
21094
21095 return false;
21096}
21097
21098/// Returns true if it is beneficial to convert a load of a constant
21099/// to just the constant itself.
21100bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
21101                                                          Type *Ty) const {
21102 assert(Ty->isIntegerTy());
21103
21104 unsigned Bits = Ty->getPrimitiveSizeInBits();
21105 if (Bits == 0 || Bits > 32)
21106 return false;
21107 return true;
21108}
21109
21110bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
21111                                                unsigned Index) const {
21112  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
21113    return false;
21114
21115 return (Index == 0 || Index == ResVT.getVectorNumElements());
21116}
21117
21118Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
21119                                        ARM_MB::MemBOpt Domain) const {
21120 // First, if the target has no DMB, see what fallback we can use.
21121 if (!Subtarget->hasDataBarrier()) {
21122 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21123 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21124 // here.
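    // The literal operands below encode the CP15 barrier operation, roughly
    // "mcr p15, 0, <Rt>, c7, c10, 5", which is the ARMv6 data memory barrier.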
21125 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21126 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21127 Builder.getInt32(0), Builder.getInt32(7),
21128 Builder.getInt32(10), Builder.getInt32(5)};
21129 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21130 } else {
21131 // Instead of using barriers, atomic accesses on these subtargets use
21132 // libcalls.
21133 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21134 }
21135 } else {
21136 // Only a full system barrier exists in the M-class architectures.
21137 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21138 Constant *CDomain = Builder.getInt32(Domain);
21139 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21140 }
21141}
21142
21143// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
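// Informally, per that mapping: a seq_cst store becomes "dmb ish; str;
// dmb ish", a release store gets only the leading "dmb ish", monotonic
// (relaxed) accesses get no fence at all, and acquire loads are handled by
// the trailing fence emitted below.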
21144Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
21145                                                 Instruction *Inst,
21146 AtomicOrdering Ord) const {
21147 switch (Ord) {
21148  case AtomicOrdering::NotAtomic:
21149  case AtomicOrdering::Unordered:
21150    llvm_unreachable("Invalid fence: unordered/non-atomic");
21151  case AtomicOrdering::Monotonic:
21152  case AtomicOrdering::Acquire:
21153    return nullptr; // Nothing to do
21154  case AtomicOrdering::SequentiallyConsistent:
21155    if (!Inst->hasAtomicStore())
21156 return nullptr; // Nothing to do
21157 [[fallthrough]];
21158  case AtomicOrdering::Release:
21159  case AtomicOrdering::AcquireRelease:
21160    if (Subtarget->preferISHSTBarriers())
21161 return makeDMB(Builder, ARM_MB::ISHST);
21162 // FIXME: add a comment with a link to documentation justifying this.
21163 else
21164 return makeDMB(Builder, ARM_MB::ISH);
21165 }
21166 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21167}
21168
21169Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
21170                                                  Instruction *Inst,
21171 AtomicOrdering Ord) const {
21172 switch (Ord) {
21173  case AtomicOrdering::NotAtomic:
21174  case AtomicOrdering::Unordered:
21175    llvm_unreachable("Invalid fence: unordered/not-atomic");
21176  case AtomicOrdering::Monotonic:
21177  case AtomicOrdering::Release:
21178    return nullptr; // Nothing to do
21179  case AtomicOrdering::Acquire:
21180  case AtomicOrdering::AcquireRelease:
21181  case AtomicOrdering::SequentiallyConsistent:
21182    return makeDMB(Builder, ARM_MB::ISH);
21183 }
21184 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21185}
21186
21187// Loads and stores less than 64 bits are already atomic; ones above that
21188// are doomed anyway, so defer to the default libcall and blame the OS when
21189// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21190// anything for those.
21191TargetLowering::AtomicExpansionKind
21192ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21193  bool has64BitAtomicStore;
21194 if (Subtarget->isMClass())
21195 has64BitAtomicStore = false;
21196 else if (Subtarget->isThumb())
21197 has64BitAtomicStore = Subtarget->hasV7Ops();
21198 else
21199 has64BitAtomicStore = Subtarget->hasV6Ops();
21200
21201 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21202 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21203                                           : AtomicExpansionKind::None;
21204}
21205
21206// Loads and stores less than 64 bits are already atomic; ones above that
21207// are doomed anyway, so defer to the default libcall and blame the OS when
21208// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21209// anything for those.
21210// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21211// guarantee, see DDI0406C ARM architecture reference manual,
21212// sections A8.8.72-74 LDRD)
21213TargetLowering::AtomicExpansionKind
21214ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21215  bool has64BitAtomicLoad;
21216 if (Subtarget->isMClass())
21217 has64BitAtomicLoad = false;
21218 else if (Subtarget->isThumb())
21219 has64BitAtomicLoad = Subtarget->hasV7Ops();
21220 else
21221 has64BitAtomicLoad = Subtarget->hasV6Ops();
21222
21223 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21224 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21225                                             : AtomicExpansionKind::None;
21226}
21227
21228// For the real atomic operations, we have ldrex/strex up to 32 bits,
21229// and up to 64 bits on the non-M profiles
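// For instance, an "atomicrmw add i32" on ARMv7 is later expanded into an
// ldrex/strex retry loop (roughly: ldrex; add; strex; cmp; bne), whereas the
// same operation on a pre-v6 target typically ends up as a runtime library
// call.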
21230TargetLowering::AtomicExpansionKind
21231ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21232  if (AI->isFloatingPointOperation())
21233    return AtomicExpansionKind::CmpXChg;
21234
21235 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21236 bool hasAtomicRMW;
21237 if (Subtarget->isMClass())
21238 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21239 else if (Subtarget->isThumb())
21240 hasAtomicRMW = Subtarget->hasV7Ops();
21241 else
21242 hasAtomicRMW = Subtarget->hasV6Ops();
21243 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21244 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21245 // implement atomicrmw without spilling. If the target address is also on
21246 // the stack and close enough to the spill slot, this can lead to a
21247 // situation where the monitor always gets cleared and the atomic operation
21248 // can never succeed. So at -O0 lower this operation to a CAS loop.
21249 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21250      return AtomicExpansionKind::CmpXChg;
21251    return AtomicExpansionKind::LLSC;
21252  }
21253  return AtomicExpansionKind::None;
21254}
21255
21256// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21257// bits, and up to 64 bits on the non-M profiles.
21258TargetLowering::AtomicExpansionKind
21259ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
21260  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21261 // implement cmpxchg without spilling. If the address being exchanged is also
21262 // on the stack and close enough to the spill slot, this can lead to a
21263 // situation where the monitor always gets cleared and the atomic operation
21264 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21265 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21266 bool HasAtomicCmpXchg;
21267 if (Subtarget->isMClass())
21268 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21269 else if (Subtarget->isThumb())
21270 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21271 else
21272 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21273 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21274 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21275    return AtomicExpansionKind::LLSC;
21276  return AtomicExpansionKind::None;
21277}
21278
21279bool ARMTargetLowering::shouldInsertFencesForAtomic(
21280    const Instruction *I) const {
21281 return InsertFencesForAtomic;
21282}
21283
21284bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
21285  // ROPI/RWPI are not supported currently.
21286 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21287}
21288
21289void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
21290  RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21291 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21292 if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
21293    return TargetLowering::insertSSPDeclarations(M);
21294
21295 // MSVC CRT has a global variable holding security cookie.
21296 M.getOrInsertGlobal("__security_cookie",
21297 PointerType::getUnqual(M.getContext()));
21298
21299 // MSVC CRT has a function to validate security cookie.
21300 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21301 getLibcallImplName(SecurityCheckCookieLibcall),
21302 Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
21303 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21304 F->addParamAttr(0, Attribute::AttrKind::InReg);
21305}
21306
21307Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
21308  // MSVC CRT has a function to validate security cookie.
21309 RTLIB::LibcallImpl SecurityCheckCookie =
21310 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21311 if (SecurityCheckCookie != RTLIB::Unsupported)
21312 return M.getFunction(getLibcallImplName(SecurityCheckCookie));
21313  return TargetLowering::getSSPStackGuardCheck(M);
21314}
21315
21316bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
21317                                                  unsigned &Cost) const {
21318 // If we do not have NEON, vector types are not natively supported.
21319 if (!Subtarget->hasNEON())
21320 return false;
21321
21322 // Floating point values and vector values map to the same register file.
21323  // Therefore, although we could do a store+extract of a vector type, it is
21324  // better to leave such values as floats, since we have more freedom in the
21325  // addressing modes for those.
21326 if (VectorTy->isFPOrFPVectorTy())
21327 return false;
21328
21329 // If the index is unknown at compile time, this is very expensive to lower
21330 // and it is not possible to combine the store with the extract.
21331 if (!isa<ConstantInt>(Idx))
21332 return false;
21333
21334 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21335 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21336 // We can do a store + vector extract on any vector that fits perfectly in a D
21337 // or Q register.
21338 if (BitWidth == 64 || BitWidth == 128) {
21339 Cost = 0;
21340 return true;
21341 }
21342 return false;
21343}
21344
21345bool ARMTargetLowering::canCreateUndefOrPoisonForTargetNode(
21346    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21347 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21348 unsigned Opcode = Op.getOpcode();
21349 switch (Opcode) {
21350 case ARMISD::VORRIMM:
21351 case ARMISD::VBICIMM:
21352 return false;
21353 }
21354  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
21355      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21356}
21357
21359 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21360}
21361
21363 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21364}
21365
21366bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
21367    const Instruction &AndI) const {
21368 if (!Subtarget->hasV7Ops())
21369 return false;
21370
21371 // Sink the `and` instruction only if the mask would fit into a modified
21372 // immediate operand.
21373  auto *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21374  if (!Mask || Mask->getValue().getBitWidth() > 32u)
21375 return false;
21376 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21377 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21378 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21379}
21380
21381TargetLowering::ShiftLegalizationStrategy
21382ARMTargetLowering::preferredShiftLegalizationStrategy(
21383    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21384 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21385    return ShiftLegalizationStrategy::LowerToLibcall;
21386  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
21387                                                            ExpansionFactor);
21388}
21389
21390Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
21391                                         Value *Addr,
21392 AtomicOrdering Ord) const {
21393 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21394 bool IsAcquire = isAcquireOrStronger(Ord);
21395
21396 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21397 // intrinsic must return {i32, i32} and we have to recombine them into a
21398 // single i64 here.
21399 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21400    Intrinsic::ID Int =
21401        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21402
21403 Value *LoHi =
21404 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21405
21406 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21407 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21408 if (!Subtarget->isLittle())
21409 std::swap (Lo, Hi);
21410 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21411 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21412 return Builder.CreateOr(
21413 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21414 }
21415
21416 Type *Tys[] = { Addr->getType() };
21417 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21418 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21419
21420 CI->addParamAttr(
21421 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21422 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21423}
21424
21425void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21426    IRBuilderBase &Builder) const {
21427 if (!Subtarget->hasV7Ops())
21428 return;
21429 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21430}
21431
21432Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21433                                               Value *Val, Value *Addr,
21434 AtomicOrdering Ord) const {
21435 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21436 bool IsRelease = isReleaseOrStronger(Ord);
21437
21438 // Since the intrinsics must have legal type, the i64 intrinsics take two
21439 // parameters: "i32, i32". We must marshal Val into the appropriate form
21440 // before the call.
21441 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21442    Intrinsic::ID Int =
21443        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21444 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21445
21446 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21447 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21448 if (!Subtarget->isLittle())
21449 std::swap(Lo, Hi);
21450 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21451 }
21452
21453 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21454 Type *Tys[] = { Addr->getType() };
21455  Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
21456
21457 CallInst *CI = Builder.CreateCall(
21458 Strex, {Builder.CreateZExtOrBitCast(
21459 Val, Strex->getFunctionType()->getParamType(0)),
21460 Addr});
21461 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21462 Val->getType()));
21463 return CI;
21464}
21465
21466
21467bool ARMTargetLowering::alignLoopsWithOptSize() const {
21468  return Subtarget->isMClass();
21469}
21470
21471/// A helper function for determining the number of interleaved accesses we
21472/// will generate when lowering accesses of the given type.
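/// For example, a 512-bit <16 x i32> vector yields (512 + 127) / 128 = 4
/// interleaved accesses, while a 64-bit vector counts as a single access.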
21473unsigned
21474ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
21475                                             const DataLayout &DL) const {
21476 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21477}
21478
21479bool ARMTargetLowering::isLegalInterleavedAccessType(
21480    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21481 const DataLayout &DL) const {
21482
21483 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21484 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21485
21486 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21487 return false;
21488
21489 // Ensure the vector doesn't have f16 elements. Even though we could do an
21490 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21491 // f32.
21492 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21493 return false;
21494 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21495 return false;
21496
21497 // Ensure the number of vector elements is greater than 1.
21498 if (VecTy->getNumElements() < 2)
21499 return false;
21500
21501 // Ensure the element type is legal.
21502 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21503 return false;
21504  // And that the alignment is high enough under MVE.
21505 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21506 return false;
21507
21508 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21509 // 128 will be split into multiple interleaved accesses.
21510 if (Subtarget->hasNEON() && VecSize == 64)
21511 return true;
21512 return VecSize % 128 == 0;
21513}
21514
21516 if (Subtarget->hasNEON())
21517 return 4;
21518 if (Subtarget->hasMVEIntegerOps())
21521}
21522
21523/// Lower an interleaved load into a vldN intrinsic.
21524///
21525/// E.g. Lower an interleaved load (Factor = 2):
21526/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21527/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21528/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21529///
21530/// Into:
21531/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21532/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21533/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21534bool ARMTargetLowering::lowerInterleavedLoad(
21535    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21536 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21537 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21538 "Invalid interleave factor");
21539 assert(!Shuffles.empty() && "Empty shufflevector input");
21540 assert(Shuffles.size() == Indices.size() &&
21541 "Unmatched number of shufflevectors and indices");
21542
21543 auto *LI = dyn_cast<LoadInst>(Load);
21544 if (!LI)
21545 return false;
21546 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21547
21548 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21549 Type *EltTy = VecTy->getElementType();
21550
21551 const DataLayout &DL = LI->getDataLayout();
21552 Align Alignment = LI->getAlign();
21553
21554 // Skip if we do not have NEON and skip illegal vector types. We can
21555 // "legalize" wide vector types into multiple interleaved accesses as long as
21556 // the vector types are divisible by 128.
21557 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21558 return false;
21559
21560 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21561
21562  // A pointer vector cannot be the return type of the ldN intrinsics. We need
21563  // to load integer vectors first and then convert them to pointer vectors.
21564 if (EltTy->isPointerTy())
21565 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21566
21567 IRBuilder<> Builder(LI);
21568
21569 // The base address of the load.
21570 Value *BaseAddr = LI->getPointerOperand();
21571
21572 if (NumLoads > 1) {
21573 // If we're going to generate more than one load, reset the sub-vector type
21574 // to something legal.
21575 VecTy = FixedVectorType::get(VecTy->getElementType(),
21576 VecTy->getNumElements() / NumLoads);
21577 }
21578
21579 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21580
21581 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21582 if (Subtarget->hasNEON()) {
21583 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21584 Type *Tys[] = {VecTy, PtrTy};
21585 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21586 Intrinsic::arm_neon_vld3,
21587 Intrinsic::arm_neon_vld4};
21588
21590 Ops.push_back(BaseAddr);
21591 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21592
21593 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21594 /*FMFSource=*/nullptr, "vldN");
21595 } else {
21596 assert((Factor == 2 || Factor == 4) &&
21597 "expected interleave factor of 2 or 4 for MVE");
21598 Intrinsic::ID LoadInts =
21599 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21600 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21601 Type *Tys[] = {VecTy, PtrTy};
21602
21604 Ops.push_back(BaseAddr);
21605 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21606 "vldN");
21607 }
21608 };
21609
21610 // Holds sub-vectors extracted from the load intrinsic return values. The
21611 // sub-vectors are associated with the shufflevector instructions they will
21612 // replace.
21613  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
21614
21615 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21616 // If we're generating more than one load, compute the base address of
21617 // subsequent loads as an offset from the previous.
21618 if (LoadCount > 0)
21619 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21620 VecTy->getNumElements() * Factor);
21621
21622 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21623
21624 // Replace uses of each shufflevector with the corresponding vector loaded
21625 // by ldN.
21626 for (unsigned i = 0; i < Shuffles.size(); i++) {
21627 ShuffleVectorInst *SV = Shuffles[i];
21628 unsigned Index = Indices[i];
21629
21630 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21631
21632 // Convert the integer vector to pointer vector if the element is pointer.
21633 if (EltTy->isPointerTy())
21634 SubVec = Builder.CreateIntToPtr(
21635 SubVec,
21637
21638 SubVecs[SV].push_back(SubVec);
21639 }
21640 }
21641
21642 // Replace uses of the shufflevector instructions with the sub-vectors
21643 // returned by the load intrinsic. If a shufflevector instruction is
21644 // associated with more than one sub-vector, those sub-vectors will be
21645 // concatenated into a single wide vector.
21646 for (ShuffleVectorInst *SVI : Shuffles) {
21647 auto &SubVec = SubVecs[SVI];
21648 auto *WideVec =
21649 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21650 SVI->replaceAllUsesWith(WideVec);
21651 }
21652
21653 return true;
21654}
21655
21656/// Lower an interleaved store into a vstN intrinsic.
21657///
21658/// E.g. Lower an interleaved store (Factor = 3):
21659/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21660/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21661/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21662///
21663/// Into:
21664/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21665/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21666/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21667/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21668///
21669/// Note that the new shufflevectors will be removed and we'll only generate one
21670/// vst3 instruction in CodeGen.
21671///
21672/// Example for a more general valid mask (Factor 3). Lower:
21673/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21674/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21675/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21676///
21677/// Into:
21678/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21679/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21680/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21681/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21682bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
21683                                              Value *LaneMask,
21684 ShuffleVectorInst *SVI,
21685 unsigned Factor,
21686 const APInt &GapMask) const {
21687 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21688 "Invalid interleave factor");
21689 auto *SI = dyn_cast<StoreInst>(Store);
21690 if (!SI)
21691 return false;
21692 assert(!LaneMask && GapMask.popcount() == Factor &&
21693 "Unexpected mask on store");
21694
21695 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21696 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21697
21698 unsigned LaneLen = VecTy->getNumElements() / Factor;
21699 Type *EltTy = VecTy->getElementType();
21700 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21701
21702 const DataLayout &DL = SI->getDataLayout();
21703 Align Alignment = SI->getAlign();
21704
21705 // Skip if we do not have NEON and skip illegal vector types. We can
21706 // "legalize" wide vector types into multiple interleaved accesses as long as
21707 // the vector types are divisible by 128.
21708 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21709 return false;
21710
21711 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21712
21713 Value *Op0 = SVI->getOperand(0);
21714 Value *Op1 = SVI->getOperand(1);
21715 IRBuilder<> Builder(SI);
21716
21717 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21718 // vectors to integer vectors.
21719 if (EltTy->isPointerTy()) {
21720 Type *IntTy = DL.getIntPtrType(EltTy);
21721
21722 // Convert to the corresponding integer vector.
21723 auto *IntVecTy =
21725 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21726 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21727
21728 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21729 }
21730
21731 // The base address of the store.
21732 Value *BaseAddr = SI->getPointerOperand();
21733
21734 if (NumStores > 1) {
21735 // If we're going to generate more than one store, reset the lane length
21736 // and sub-vector type to something legal.
21737 LaneLen /= NumStores;
21738 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21739 }
21740
21741 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21742
21743 auto Mask = SVI->getShuffleMask();
21744
21745 auto createStoreIntrinsic = [&](Value *BaseAddr,
21746 SmallVectorImpl<Value *> &Shuffles) {
21747 if (Subtarget->hasNEON()) {
21748 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21749 Intrinsic::arm_neon_vst3,
21750 Intrinsic::arm_neon_vst4};
21751 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21752 Type *Tys[] = {PtrTy, SubVecTy};
21753
21755 Ops.push_back(BaseAddr);
21756 append_range(Ops, Shuffles);
21757 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21758 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21759 } else {
21760 assert((Factor == 2 || Factor == 4) &&
21761 "expected interleave factor of 2 or 4 for MVE");
21762 Intrinsic::ID StoreInts =
21763 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21764 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21765 Type *Tys[] = {PtrTy, SubVecTy};
21766
21768 Ops.push_back(BaseAddr);
21769 append_range(Ops, Shuffles);
21770 for (unsigned F = 0; F < Factor; F++) {
21771 Ops.push_back(Builder.getInt32(F));
21772 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21773 Ops.pop_back();
21774 }
21775 }
21776 };
21777
21778 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21779    // If we're generating more than one store, we compute the base address of
21780 // subsequent stores as an offset from the previous.
21781 if (StoreCount > 0)
21782 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21783 BaseAddr, LaneLen * Factor);
21784
21785 SmallVector<Value *, 4> Shuffles;
21786
21787 // Split the shufflevector operands into sub vectors for the new vstN call.
21788 for (unsigned i = 0; i < Factor; i++) {
21789 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21790 if (Mask[IdxI] >= 0) {
21791 Shuffles.push_back(Builder.CreateShuffleVector(
21792 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21793 } else {
21794 unsigned StartMask = 0;
21795 for (unsigned j = 1; j < LaneLen; j++) {
21796 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21797 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21798 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21799 break;
21800 }
21801 }
21802 // Note: If all elements in a chunk are undefs, StartMask=0!
21803 // Note: Filling undef gaps with random elements is ok, since
21804 // those elements were being written anyway (with undefs).
21805 // In the case of all undefs we're defaulting to using elems from 0
21806 // Note: StartMask cannot be negative, it's checked in
21807 // isReInterleaveMask
21808 Shuffles.push_back(Builder.CreateShuffleVector(
21809 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21810 }
21811 }
21812
21813 createStoreIntrinsic(BaseAddr, Shuffles);
21814 }
21815 return true;
21816}
21817
21818enum HABaseType {
21819  HA_UNKNOWN = 0,
21820  HA_FLOAT,
21821  HA_DOUBLE,
21822  HA_VECT64,
21823  HA_VECT128
21824};
21825
21826static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21827                                   uint64_t &Members) {
21828 if (auto *ST = dyn_cast<StructType>(Ty)) {
21829 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21830 uint64_t SubMembers = 0;
21831 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21832 return false;
21833 Members += SubMembers;
21834 }
21835 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21836 uint64_t SubMembers = 0;
21837 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21838 return false;
21839 Members += SubMembers * AT->getNumElements();
21840 } else if (Ty->isFloatTy()) {
21841 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21842 return false;
21843 Members = 1;
21844 Base = HA_FLOAT;
21845 } else if (Ty->isDoubleTy()) {
21846 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21847 return false;
21848 Members = 1;
21849 Base = HA_DOUBLE;
21850 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21851 Members = 1;
21852 switch (Base) {
21853 case HA_FLOAT:
21854 case HA_DOUBLE:
21855 return false;
21856 case HA_VECT64:
21857 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21858 case HA_VECT128:
21859 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21860 case HA_UNKNOWN:
21861 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21862 case 64:
21863 Base = HA_VECT64;
21864 return true;
21865 case 128:
21866 Base = HA_VECT128;
21867 return true;
21868 default:
21869 return false;
21870 }
21871 }
21872 }
21873
21874 return (Members > 0 && Members <= 4);
21875}
21876
21877/// Return the correct alignment for the current calling convention.
21878Align ARMTargetLowering::getABIAlignmentForCallingConv(
21879    Type *ArgTy, const DataLayout &DL) const {
21880 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21881 if (!ArgTy->isVectorTy())
21882 return ABITypeAlign;
21883
21884 // Avoid over-aligning vector parameters. It would require realigning the
21885 // stack and waste space for no real benefit.
21886 MaybeAlign StackAlign = DL.getStackAlignment();
21887 assert(StackAlign && "data layout string is missing stack alignment");
21888 return std::min(ABITypeAlign, *StackAlign);
21889}
21890
21891/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21892/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21893/// passing according to AAPCS rules.
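/// For instance, "struct S { float x, y, z, w; }" is a homogeneous aggregate
/// of four floats and is normally passed in s0-s3 under AAPCS-VFP, so no
/// inter-element padding needs to be emitted for it.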
21894bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
21895    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21896 const DataLayout &DL) const {
21897 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21898      CallingConv::ARM_AAPCS_VFP)
21899    return false;
21900
21901  HABaseType Base = HA_UNKNOWN;
21902  uint64_t Members = 0;
21903 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21904 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21905
21906 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21907 return IsHA || IsIntArray;
21908}
21909
21910Register ARMTargetLowering::getExceptionPointerRegister(
21911    const Constant *PersonalityFn) const {
21912 // Platforms which do not use SjLj EH may return values in these registers
21913 // via the personality function.
21915 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21916}
21917
21918Register ARMTargetLowering::getExceptionSelectorRegister(
21919    const Constant *PersonalityFn) const {
21920 // Platforms which do not use SjLj EH may return values in these registers
21921 // via the personality function.
21923 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21924}
21925
21926void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21927 // Update IsSplitCSR in ARMFunctionInfo.
21928 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21929 AFI->setIsSplitCSR(true);
21930}
21931
21932void ARMTargetLowering::insertCopiesSplitCSR(
21933 MachineBasicBlock *Entry,
21934 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21935 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21936 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21937 if (!IStart)
21938 return;
21939
21940 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21941 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21942 MachineBasicBlock::iterator MBBI = Entry->begin();
21943 for (const MCPhysReg *I = IStart; *I; ++I) {
21944 const TargetRegisterClass *RC = nullptr;
21945 if (ARM::GPRRegClass.contains(*I))
21946 RC = &ARM::GPRRegClass;
21947 else if (ARM::DPRRegClass.contains(*I))
21948 RC = &ARM::DPRRegClass;
21949 else
21950 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21951
21952 Register NewVR = MRI->createVirtualRegister(RC);
21953 // Create copy from CSR to a virtual register.
21954 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21955 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21956 // nounwind. If we want to generalize this later, we may need to emit
21957 // CFI pseudo-instructions.
21958 assert(Entry->getParent()->getFunction().hasFnAttribute(
21959 Attribute::NoUnwind) &&
21960 "Function should be nounwind in insertCopiesSplitCSR!");
21961 Entry->addLiveIn(*I);
21962 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21963 .addReg(*I);
21964
21965 // Insert the copy-back instructions right before the terminator.
21966 for (auto *Exit : Exits)
21967 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21968 TII->get(TargetOpcode::COPY), *I)
21969 .addReg(NewVR);
21970 }
21971}
21972
21977
21978bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
21979  return Subtarget->hasMVEIntegerOps();
21980}
21981
21982bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
21983    ComplexDeinterleavingOperation Operation, Type *Ty) const {
21984  auto *VTy = dyn_cast<FixedVectorType>(Ty);
21985 if (!VTy)
21986 return false;
21987
21988 auto *ScalarTy = VTy->getScalarType();
21989 unsigned NumElements = VTy->getNumElements();
21990
21991 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21992 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
21993 return false;
21994
21995 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
21996 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
21997 return Subtarget->hasMVEFloatOps();
21998
21999  if (Operation != ComplexDeinterleavingOperation::CAdd)
22000    return false;
22001
22002 return Subtarget->hasMVEIntegerOps() &&
22003 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22004 ScalarTy->isIntegerTy(32));
22005}
22006
22007Value *ARMTargetLowering::createComplexDeinterleavingIR(
22008    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
22009    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22010 Value *Accumulator) const {
22011
22012  auto *Ty = cast<FixedVectorType>(InputA->getType());
22013
22014 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22015
22016 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22017
22018 if (TyWidth > 128) {
22019 int Stride = Ty->getNumElements() / 2;
22020 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22021 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22022 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22023 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22024
22025 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22026 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22027 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22028 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22029 Value *LowerSplitAcc = nullptr;
22030 Value *UpperSplitAcc = nullptr;
22031
22032 if (Accumulator) {
22033 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22034 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22035 }
22036
22037 auto *LowerSplitInt = createComplexDeinterleavingIR(
22038 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22039 auto *UpperSplitInt = createComplexDeinterleavingIR(
22040 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22041
22042 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22043 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22044 }
22045
22046 auto *IntTy = Type::getInt32Ty(B.getContext());
22047
22048 ConstantInt *ConstRotation = nullptr;
22049 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22050 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22051
22052 if (Accumulator)
22053 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22054 {ConstRotation, Accumulator, InputB, InputA});
22055 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22056 {ConstRotation, InputB, InputA});
22057 }
22058
22059 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22060 // 1 means the value is not halved.
22061 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22062
22063    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
22064      ConstRotation = ConstantInt::get(IntTy, 0);
22065    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
22066      ConstRotation = ConstantInt::get(IntTy, 1);
22067
22068 if (!ConstRotation)
22069 return nullptr; // Invalid rotation for arm_mve_vcaddq
22070
22071 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22072 {ConstHalving, ConstRotation, InputA, InputB});
22073 }
22074
22075 return nullptr;
22076}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
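A few representative cases of that mapping, as a hedged sketch (the in-tree routine covers every ISD::CondCode; ARMCC comes from the target-private MCTargetDesc/ARMBaseInfo.h):

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/ErrorHandling.h"
  using namespace llvm;

  static ARMCC::CondCodes IntCCToARMCCSketch(ISD::CondCode CC) {
    switch (CC) {
    case ISD::SETEQ:  return ARMCC::EQ;
    case ISD::SETNE:  return ARMCC::NE;
    case ISD::SETGT:  return ARMCC::GT;  // signed greater than
    case ISD::SETULT: return ARMCC::LO;  // unsigned lower
    default:          llvm_unreachable("case not shown in this sketch");
    }
  }
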
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
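For comparison, a target-independent way to build the same value (a sketch only; the ARM helper has its own lowering so the zero is cheap to materialize as a vector immediate):

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // DAG.getConstant on a vector type yields a splat of zeroes.
  static SDValue getZeroVectorGeneric(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
    return DAG.getConstant(0, dl, VT);
  }
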
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
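The algebraic rewrite it relies on, written as a hedged DAG sketch (the helper name is invented; the point is that each product can then feed a multiply/multiply-accumulate pattern):

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // (A + B) * C  ==>  (A * C) + (B * C)
  static SDValue distributeMulOverAdd(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                      SDValue A, SDValue B, SDValue C) {
    SDValue AC = DAG.getNode(ISD::MUL, DL, VT, A, C);
    SDValue BC = DAG.getNode(ISD::MUL, DL, VT, B, C);
    return DAG.getNode(ISD::ADD, DL, VT, AC, BC);
  }
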
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
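The core of such a check, as a minimal sketch (assumption: only an explicit ConstantFP operand is recognized here; the real helper may also look through other forms):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static bool isPosZeroFP(SDValue Op) {
    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(Op))
      return CFP->getValueAPF().isPosZero();
    return false;
  }
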
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
#define MAKE_CASE(V)
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5999
APInt bitcastToAPInt() const
Definition APFloat.h:1353
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1332
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1201
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
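A small usage sketch of getSplat (values chosen purely for illustration):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void splatExample() {
    APInt Byte(8, 0xAB);
    APInt Word = APInt::getSplat(32, Byte); // 0xABABABAB
    (void)Word;
  }
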
unsigned logBase2() const
Definition APInt.h:1761
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:475
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
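A short usage sketch covering the mask constructors listed above (bit patterns in the comments assume a 32-bit width):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void maskExamples() {
    APInt Low  = APInt::getLowBitsSet(32, 8);  // 0x000000FF
    APInt High = APInt::getHighBitsSet(32, 8); // 0xFF000000
    APInt Bit4 = APInt::getOneBitSet(32, 4);   // 0x00000010
    APInt Ones = APInt::getAllOnes(32);        // 0xFFFFFFFF
    (void)Low; (void)High; (void)Bit4; (void)Ones;
  }
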
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
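The equivalence rests on the two's-complement identity ~x == -x - 1, so y - ~x == (x + 1) + y; a tiny plain-C++ check with hypothetical names:

  #include <cstdint>

  static uint32_t viaSubOfNot(uint32_t x, uint32_t y) { return y - ~x; }
  static uint32_t viaIncOfAdd(uint32_t x, uint32_t y) { return (x + 1) + y; }
  // Both compute the same value for all x and y.
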
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
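A typical call shape, using the out-parameters exactly as declared above (a hedged fragment, not code from this file):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static bool getSplatBits(SDNode *N, APInt &SplatBits) {
    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (auto *BVN = dyn_cast<BuildVectorSDNode>(N))
      return BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                  HasAnyUndefs);
    return false;
  }
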
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:277
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:237
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:295
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2783
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
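Illustrative sketch only: constructing and querying MVTs with the members listed above. The widening rule shown here is an arbitrary example, and the header path is assumed to be the one used in recent LLVM trees.
  #include "llvm/CodeGenTypes/MachineValueType.h"
  #include <cassert>

  // Hypothetical helper: widen each integer element to twice its width,
  // e.g. v4i16 -> v4i32, keeping the element count.
  static llvm::MVT widenElements(llvm::MVT VT) {
    assert(VT.isInteger() && !VT.isScalarInteger() && "expected a vector of integers");
    llvm::MVT WideElt =
        llvm::MVT::getIntegerVT(unsigned(VT.getScalarSizeInBits() * 2));
    return llvm::MVT::getVectorVT(WideElt, VT.getVectorNumElements());
  }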
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
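Illustrative sketch only: the block-manipulation members above are commonly combined in the pattern below when a pseudo instruction is expanded with a custom inserter. MI and all names here are placeholders.
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include <iterator>

  // Hypothetical helper: split the parent block after MI, moving the tail of
  // the block into a new "continuation" block and fixing up the CFG.
  static llvm::MachineBasicBlock *splitAfter(llvm::MachineInstr &MI) {
    llvm::MachineBasicBlock *BB = MI.getParent();
    llvm::MachineFunction *MF = BB->getParent();
    llvm::MachineBasicBlock *ContBB =
        MF->CreateMachineBasicBlock(BB->getBasicBlock());
    MF->insert(std::next(BB->getIterator()), ContBB);
    ContBB->splice(ContBB->begin(), BB,
                   std::next(llvm::MachineBasicBlock::iterator(MI)), BB->end());
    ContBB->transferSuccessorsAndUpdatePHIs(BB);
    BB->addSuccessor(ContBB);
    return ContBB;
  }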
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
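Illustrative sketch only: the MachineInstrBuilder methods above chain onto BuildMI as shown here. The opcode description, registers and insertion point are hypothetical, and real ARM instructions normally need further operands (e.g. predicate operands) that are omitted from this sketch.
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/IR/DebugLoc.h"
  #include "llvm/MC/MCInstrDesc.h"

  // Hypothetical helper: emit "Dst = <opcode> Src, 42" before InsertPt.
  static void emitWithImmediate(llvm::MachineBasicBlock &MBB,
                                llvm::MachineBasicBlock::iterator InsertPt,
                                const llvm::DebugLoc &DL,
                                const llvm::MCInstrDesc &Desc,
                                llvm::Register Dst, llvm::Register Src) {
    llvm::BuildMI(MBB, InsertPt, DL, Desc, Dst)
        .addReg(Src)   // register use
        .addImm(42);   // immediate operand
  }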
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by the operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
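Illustrative sketch only: the SDValue/SDNode accessors above are typically used for operand pattern checks like the one below. The specific pattern (an add with a one-use result and a constant right operand) is just an example.
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Hypothetical predicate: match (add X, C) where the add has one use, and
  // return the constant through CVal.
  static bool matchAddWithConstant(llvm::SDValue V, uint64_t &CVal) {
    if (V.getOpcode() != llvm::ISD::ADD || !V.hasOneUse())
      return false;
    if (auto *C = llvm::dyn_cast<llvm::ConstantSDNode>(V.getOperand(1))) {
      CVal = C->getZExtValue();
      return true;
    }
    return false;
  }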
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
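Illustrative sketch only: the SelectionDAG builder methods above are combined roughly as below inside a custom lowering routine. The rewrite shown, (add X, (shl Y, 1)), is an arbitrary example and not a transformation performed by this file.
  #include "llvm/CodeGen/SelectionDAG.h"

  // Hypothetical lowering helper: rebuild Op's operands into new nodes.
  static llvm::SDValue lowerAsShiftedAdd(llvm::SDValue Op, llvm::SelectionDAG &DAG) {
    llvm::SDLoc DL(Op);
    llvm::EVT VT = Op.getValueType();
    llvm::SDValue X = Op.getOperand(0);
    llvm::SDValue Y = Op.getOperand(1);
    llvm::SDValue Shl =
        DAG.getNode(llvm::ISD::SHL, DL, VT, Y, DAG.getConstant(1, DL, VT));
    return DAG.getNode(llvm::ISD::ADD, DL, VT, X, Shl);
  }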
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
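Illustrative sketch only: the TargetLoweringBase hooks above are normally invoked from a target's lowering constructor in the style shown here. "MyTargetLowering", the chosen register class and the per-operation actions are placeholders and do not reproduce the ARM-specific configuration performed in this file.
  #include "llvm/CodeGen/TargetLowering.h"

  // Hypothetical target lowering class showing typical constructor-time setup.
  class MyTargetLowering : public llvm::TargetLowering {
  public:
    MyTargetLowering(const llvm::TargetMachine &TM,
                     const llvm::TargetRegisterClass *GPR,
                     const llvm::TargetRegisterInfo *TRI)
        : llvm::TargetLowering(TM) {
      addRegisterClass(llvm::MVT::i32, GPR);                   // i32 lives in GPRs
      setOperationAction(llvm::ISD::SDIVREM, llvm::MVT::i32, Expand);
      setOperationAction(llvm::ISD::SELECT_CC, llvm::MVT::i32, Custom);
      setTargetDAGCombine(llvm::ISD::ADD);                     // custom-combine ADD nodes
      setSchedulingPreference(llvm::Sched::RegPressure);
      computeRegisterProperties(TRI);                          // derive legal types
    }
  };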
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:437
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
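Illustrative sketch only: the Type predicates above are used for checks like the one below when deciding how a value should be handled. The 64-bit cutoff and helper name are arbitrary examples.
  #include "llvm/IR/Type.h"

  // Hypothetical predicate: a scalar floating-point value of at most 64 bits.
  static bool isSmallScalarFP(llvm::Type *Ty) {
    return Ty->isFloatingPointTy() && !Ty->isVectorTy() &&
           Ty->getPrimitiveSizeInBits().getFixedValue() <= 64;
  }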
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative (RWPI).
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
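Illustrative sketch only: the immediate-encoding helpers above answer questions like the one below, where a return of -1 means the value has no direct encoding and would have to be materialized some other way (for example movw/movt or a constant-pool load). The include path and helper name are assumptions.
  #include "MCTargetDesc/ARMAddressingModes.h"

  // Hypothetical check: can Imm be encoded directly as a modified immediate?
  static bool hasDirectImmEncoding(unsigned Imm, bool IsThumb2) {
    int Enc = IsThumb2 ? llvm::ARM_AM::getT2SOImmVal(Imm)
                       : llvm::ARM_AM::getSOImmVal(Imm);
    return Enc != -1;
  }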
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
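Illustrative sketch only: the ISD load predicates above are typically combined as follows when a DAG combine should only fire on a plain, single-use load; the extra isSimple() test is an example of a common additional guard.
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Hypothetical guard: a non-extending, unindexed load whose value has one
  // use and whose access is neither atomic nor volatile.
  static bool isPlainSingleUseLoad(llvm::SDNode *N) {
    if (!llvm::ISD::isNormalLoad(N) || !N->hasOneUse())
      return false;
    return llvm::cast<llvm::LoadSDNode>(N)->isSimple();
  }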
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
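Illustrative sketch only: the RTLIB selectors above pair with TargetLowering::makeLibCall roughly as shown. The helper name is hypothetical and error handling is reduced to an assert.
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include <cassert>

  // Hypothetical soft-float expansion of an FP_TO_SINT node via a libcall.
  static llvm::SDValue softenFPToSInt(llvm::SDValue Op, llvm::SelectionDAG &DAG,
                                      const llvm::TargetLowering &TLI) {
    llvm::SDLoc dl(Op);
    llvm::EVT SrcVT = Op.getOperand(0).getValueType();
    llvm::RTLIB::Libcall LC = llvm::RTLIB::getFPTOSINT(SrcVT, Op.getValueType());
    assert(LC != llvm::RTLIB::UNKNOWN_LIBCALL && "no libcall for this type pair");
    llvm::TargetLowering::MakeLibCallOptions CallOptions;
    return TLI.makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                           CallOptions, dl).first;
  }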
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2060
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:279
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
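These tablegen-backed assignment functions are not called directly; they are handed to a CCState analysis. A minimal sketch, assuming the usual CallingConvLower machinery and the ARM backend's ARMCallingConv.h:
#include "ARMCallingConv.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;
// Let CCState assign each formal argument a register or stack location
// according to the AAPCS rules encoded in CC_ARM_AAPCS.
static void assignIncomingArgs(CCState &CCInfo,
                               const SmallVectorImpl<ISD::InputArg> &Ins) {
  CCInfo.AnalyzeFormalArguments(Ins, CC_ARM_AAPCS);
}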
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:264
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1518
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:276
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
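A standalone sketch combining the bit-manipulation helpers above, of the kind used when a multiply by a constant can become a shift; the helper name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <optional>
using namespace llvm;
// If C is a power of two, return the shift amount that replaces 'x * C';
// Log2_32(C) and countr_zero(C) agree for such values.
static std::optional<unsigned> mulToShiftAmount(uint32_t C) {
  if (!isPowerOf2_32(C))
    return std::nullopt;
  return Log2_32(C);
}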
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
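For illustration, and assuming a valid ARMSubtarget, this is how the two materialization-cost helpers are typically paired; the function name is ours, not the file's.
#include "ARMBaseInstrInfo.h"
using namespace llvm;
// Decide whether ~Imm is cheaper to materialize than Imm itself, in which
// case the caller can build the value with an MVN-based sequence instead.
static bool preferInvertedImmediate(unsigned Imm, const ARMSubtarget *ST,
                                    bool ForCodesize) {
  return HasLowerConstantMaterializationCost(~Imm, Imm, ST, ForCodesize);
}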
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
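A trivial, self-contained sketch of alignTo from llvm/Support/Alignment.h, of the kind used when sizing byval stack slots (the 4-byte boundary is just an example).
#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;
// Round a byval argument's size up to the next 4-byte stack boundary.
static uint64_t paddedByValSize(uint64_t SizeInBytes) {
  return alignTo(SizeInBytes, Align(4));
}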
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
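A hedged sketch of its usual use: matching a scalar shift amount and a uniform vector shift amount with one query; the helper name is illustrative.
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <cstdint>
#include <optional>
using namespace llvm;
// Return the constant shift amount if ShAmt is a plain constant or a
// constant-splat BUILD_VECTOR, and std::nullopt otherwise.
static std::optional<uint64_t> getUniformShiftAmount(SDValue ShAmt) {
  if (ConstantSDNode *C = isConstOrConstSplat(ShAmt))
    return C->getZExtValue();
  return std::nullopt;
}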
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:603
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
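A sketch of the standard ARM-backend idiom that ties BuildMI, predOps and condCodeOp together when emitting a predicable instruction; the opcode and helper name are chosen for illustration.
#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit 'add Dst, Src, #Imm' with the always-true AL predicate and an unused
// (no-CPSR) condition-code result operand.
static void emitAddImm(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator InsertPt, const DebugLoc &DL,
                       const ARMBaseInstrInfo &TII, Register Dst, Register Src,
                       unsigned Imm) {
  BuildMI(MBB, InsertPt, DL, TII.get(ARM::ADDri), Dst)
      .addReg(Src)
      .addImm(Imm)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());
}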
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
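A short sketch of this helper from llvm/Analysis/VectorUtils.h, building the mask that selects a contiguous four-element window starting at index 2.
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
// Produces {2, 3, 4, 5}; any trailing NumUndefs entries (none here) are -1.
static SmallVector<int, 16> fourElementWindow() {
  return createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/0);
}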
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:207
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
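A self-contained sketch of the KnownBits API above, of the flavour exercised by computeKnownBits-style hooks; the values are invented.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// Model an 8-bit value known to be 0x5A, zero-extend it to 32 bits, then add
// a known 32-bit constant; both inputs are fully known, so the sum is too.
static KnownBits knownSum() {
  KnownBits Narrow = KnownBits::makeConstant(APInt(8, 0x5A));
  KnownBits Addend = KnownBits::makeConstant(APInt(32, 0x100));
  return KnownBits::add(Narrow.zext(32), Addend);
}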
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
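A hedged sketch showing why these MachinePointerInfo factories matter: a DAG load tagged with fixed-stack info lets later combines reason about aliasing. The helper name is ours; the calls assume the standard SelectionDAG API.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Load VT from frame index FI, describing the memory as that exact fixed
// stack slot rather than as an unknown location.
static SDValue loadFromFrameIndex(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
                                  SDValue Chain, int FI) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue FIPtr = DAG.getFrameIndex(
      FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  return DAG.getLoad(VT, dl, Chain, FIPtr,
                     MachinePointerInfo::getFixedStack(MF, FI));
}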
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
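A minimal sketch of the builder pattern these setters form, mirroring how backends funnel runtime calls through TargetLowering::LowerCallTo; the calling convention and helper name are chosen for illustration, and argument construction is elided.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
using namespace llvm;
// Describe a library call and lower it; LowerCallTo returns {result, chain}
// and only the result value is kept here for brevity.
static SDValue lowerRuntimeCall(SelectionDAG &DAG, const SDLoc &dl,
                                SDValue Chain, SDValue Callee, Type *RetTy,
                                TargetLowering::ArgListTy &&Args) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(CallingConv::ARM_AAPCS,
                                                   RetTy, Callee,
                                                   std::move(Args));
  return TLI.LowerCallTo(CLI).first;
}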
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...