LLVM 22.0.0git
AsmLexer.cpp
Go to the documentation of this file.
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/MC/MCAsmInfo.h"
21#include "llvm/Support/SMLoc.h"
24#include <cassert>
25#include <cctype>
26#include <cstdio>
27#include <cstring>
28#include <string>
29
30using namespace llvm;
31
33
35 return SMLoc::getFromPointer(Str.data() + Str.size());
36}
37
39
41 switch (Kind) {
42 case AsmToken::Error:
43 OS << "error";
44 break;
46 OS << "identifier: " << getString();
47 break;
49 OS << "int: " << getString();
50 break;
51 case AsmToken::Real:
52 OS << "real: " << getString();
53 break;
55 OS << "string: " << getString();
56 break;
57
58 // clang-format off
59 case AsmToken::Amp: OS << "Amp"; break;
60 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
61 case AsmToken::At: OS << "At"; break;
62 case AsmToken::BackSlash: OS << "BackSlash"; break;
63 case AsmToken::BigNum: OS << "BigNum"; break;
64 case AsmToken::Caret: OS << "Caret"; break;
65 case AsmToken::Colon: OS << "Colon"; break;
66 case AsmToken::Comma: OS << "Comma"; break;
67 case AsmToken::Comment: OS << "Comment"; break;
68 case AsmToken::Dollar: OS << "Dollar"; break;
69 case AsmToken::Dot: OS << "Dot"; break;
70 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
71 case AsmToken::Eof: OS << "Eof"; break;
72 case AsmToken::Equal: OS << "Equal"; break;
73 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
74 case AsmToken::Exclaim: OS << "Exclaim"; break;
75 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
76 case AsmToken::Greater: OS << "Greater"; break;
77 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
78 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
79 case AsmToken::Hash: OS << "Hash"; break;
80 case AsmToken::HashDirective: OS << "HashDirective"; break;
81 case AsmToken::LBrac: OS << "LBrac"; break;
82 case AsmToken::LCurly: OS << "LCurly"; break;
83 case AsmToken::LParen: OS << "LParen"; break;
84 case AsmToken::Less: OS << "Less"; break;
85 case AsmToken::LessEqual: OS << "LessEqual"; break;
86 case AsmToken::LessGreater: OS << "LessGreater"; break;
87 case AsmToken::LessLess: OS << "LessLess"; break;
88 case AsmToken::Minus: OS << "Minus"; break;
89 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
90 case AsmToken::Percent: OS << "Percent"; break;
91 case AsmToken::Pipe: OS << "Pipe"; break;
92 case AsmToken::PipePipe: OS << "PipePipe"; break;
93 case AsmToken::Plus: OS << "Plus"; break;
94 case AsmToken::Question: OS << "Question"; break;
95 case AsmToken::RBrac: OS << "RBrac"; break;
96 case AsmToken::RCurly: OS << "RCurly"; break;
97 case AsmToken::RParen: OS << "RParen"; break;
98 case AsmToken::Slash: OS << "Slash"; break;
99 case AsmToken::Space: OS << "Space"; break;
100 case AsmToken::Star: OS << "Star"; break;
101 case AsmToken::Tilde: OS << "Tilde"; break;
102 // clang-format on
103 }
104
105 // Print the token string.
106 OS << " (\"";
108 OS << "\")";
109}
110
111AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
112 // For COFF targets, this is true, while for ELF targets, it should be false.
113 // Currently, @specifier parsing depends on '@' being included in the token.
114 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") &&
115 MAI.useAtForSpecifier();
116 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
117
118 CurTok.emplace_back(AsmToken::Space, StringRef());
119}
120
121void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
122 bool EndStatementAtEOF) {
123 CurBuf = Buf;
124
125 if (ptr)
126 CurPtr = ptr;
127 else
128 CurPtr = CurBuf.begin();
129
130 TokStart = nullptr;
131 this->EndStatementAtEOF = EndStatementAtEOF;
132}
133
134/// ReturnError - Set the error to the specified string at the specified
135/// location. This is defined to always return AsmToken::Error.
136AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
137 SetError(SMLoc::getFromPointer(Loc), Msg);
138
139 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
140}
141
142int AsmLexer::getNextChar() {
143 if (CurPtr == CurBuf.end())
144 return EOF;
145 return (unsigned char)*CurPtr++;
146}
147
148int AsmLexer::peekNextChar() {
149 if (CurPtr == CurBuf.end())
150 return EOF;
151 return (unsigned char)*CurPtr;
152}
153
154/// The leading integral digit sequence and dot should have already been
155/// consumed, some or all of the fractional digit sequence *can* have been
156/// consumed.
157AsmToken AsmLexer::LexFloatLiteral() {
158 // Skip the fractional digit sequence.
159 while (isDigit(*CurPtr))
160 ++CurPtr;
161
162 if (*CurPtr == '-' || *CurPtr == '+')
163 return ReturnError(CurPtr, "invalid sign in float literal");
164
165 // Check for exponent
166 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
167 ++CurPtr;
168
169 if (*CurPtr == '-' || *CurPtr == '+')
170 ++CurPtr;
171
172 while (isDigit(*CurPtr))
173 ++CurPtr;
174 }
175
177 StringRef(TokStart, CurPtr - TokStart));
178}
179
180/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
181/// while making sure there are enough actual digits around for the constant to
182/// be valid.
183///
184/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
185/// before we get here.
186AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
187 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
188 "unexpected parse state in floating hex");
189 bool NoFracDigits = true;
190
191 // Skip the fractional part if there is one
192 if (*CurPtr == '.') {
193 ++CurPtr;
194
195 const char *FracStart = CurPtr;
196 while (isHexDigit(*CurPtr))
197 ++CurPtr;
198
199 NoFracDigits = CurPtr == FracStart;
200 }
201
202 if (NoIntDigits && NoFracDigits)
203 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
204 "expected at least one significand digit");
205
206 // Make sure we do have some kind of proper exponent part
207 if (*CurPtr != 'p' && *CurPtr != 'P')
208 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
209 "expected exponent part 'p'");
210 ++CurPtr;
211
212 if (*CurPtr == '+' || *CurPtr == '-')
213 ++CurPtr;
214
215 // N.b. exponent digits are *not* hex
216 const char *ExpStart = CurPtr;
217 while (isDigit(*CurPtr))
218 ++CurPtr;
219
220 if (CurPtr == ExpStart)
221 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
222 "expected at least one exponent digit");
223
224 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
225}
226
227/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
228static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
229 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
230 (AllowAt && C == '@') || (AllowHash && C == '#');
231}
232
233AsmToken AsmLexer::LexIdentifier() {
234 // Check for floating point literals.
235 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
236 // Disambiguate a .1243foo identifier from a floating literal.
237 while (isDigit(*CurPtr))
238 ++CurPtr;
239
240 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
241 AllowHashInIdentifier) ||
242 *CurPtr == 'e' || *CurPtr == 'E')
243 return LexFloatLiteral();
244 }
245
246 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
247 ++CurPtr;
248
249 // Handle . as a special case.
250 if (CurPtr == TokStart+1 && TokStart[0] == '.')
251 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
252
253 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
254}
255
256/// LexSlash: Slash: /
257/// C-Style Comment: /* ... */
258/// C-style Comment: // ...
259AsmToken AsmLexer::LexSlash() {
261 IsAtStartOfStatement = false;
262 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
263 }
264
265 switch (*CurPtr) {
266 case '*':
267 IsAtStartOfStatement = false;
268 break; // C style comment.
269 case '/':
270 ++CurPtr;
271 return LexLineComment();
272 default:
273 IsAtStartOfStatement = false;
274 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
275 }
276
277 // C Style comment.
278 ++CurPtr; // skip the star.
279 const char *CommentTextStart = CurPtr;
280 while (CurPtr != CurBuf.end()) {
281 switch (*CurPtr++) {
282 case '*':
283 // End of the comment?
284 if (*CurPtr != '/')
285 break;
286 // If we have a CommentConsumer, notify it about the comment.
287 if (CommentConsumer) {
288 CommentConsumer->HandleComment(
289 SMLoc::getFromPointer(CommentTextStart),
290 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
291 }
292 ++CurPtr; // End the */.
294 StringRef(TokStart, CurPtr - TokStart));
295 }
296 }
297 return ReturnError(TokStart, "unterminated comment");
298}
299
300/// LexLineComment: Comment: #[^\n]*
301/// : //[^\n]*
302AsmToken AsmLexer::LexLineComment() {
303 // Mark This as an end of statement with a body of the
304 // comment. While it would be nicer to leave this two tokens,
305 // backwards compatability with TargetParsers makes keeping this in this form
306 // better.
307 const char *CommentTextStart = CurPtr;
308 int CurChar = getNextChar();
309 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
310 CurChar = getNextChar();
311 const char *NewlinePtr = CurPtr;
312 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
313 ++CurPtr;
314
315 // If we have a CommentConsumer, notify it about the comment.
316 if (CommentConsumer) {
317 CommentConsumer->HandleComment(
318 SMLoc::getFromPointer(CommentTextStart),
319 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
320 }
321
322 IsAtStartOfLine = true;
323 // This is a whole line comment. leave newline
324 if (IsAtStartOfStatement)
326 StringRef(TokStart, CurPtr - TokStart));
327 IsAtStartOfStatement = true;
328
330 StringRef(TokStart, CurPtr - 1 - TokStart));
331}
332
/// Advance \p CurPtr past an optional integer type suffix: at most one
/// 'U'/'u' followed by at most two 'L'/'l' characters. The text must be
/// terminated by a character that is not part of a suffix (e.g. NUL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
342
343// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
344// integer as a hexadecimal, possibly with leading zeroes.
345static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
346 bool LexHex) {
347 const char *FirstNonDec = nullptr;
348 const char *LookAhead = CurPtr;
349 while (true) {
350 if (isDigit(*LookAhead)) {
351 ++LookAhead;
352 } else {
353 if (!FirstNonDec)
354 FirstNonDec = LookAhead;
355
356 // Keep going if we are looking for a 'h' suffix.
357 if (LexHex && isHexDigit(*LookAhead))
358 ++LookAhead;
359 else
360 break;
361 }
362 }
363 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
364 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
365 if (isHex)
366 return 16;
367 return DefaultRadix;
368}
369
370static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
371 while (hexDigitValue(*CurPtr) < DefaultRadix) {
372 ++CurPtr;
373 }
374 return CurPtr;
375}
376
378 if (Value.isIntN(64))
381}
382
/// Return the name used in diagnostics for \p Radix: "binary", "octal",
/// "decimal" or "hexadecimal" for 2/8/10/16, and "base-N" for anything else.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
397
398/// LexDigit: First character is [0-9].
399/// Local Label: [0-9][:]
400/// Forward/Backward Label: [0-9][fb]
401/// Binary integer: 0b[01]+
402/// Octal integer: 0[0-7]+
403/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
404/// Decimal integer: [1-9][0-9]*
405AsmToken AsmLexer::LexDigit() {
406 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
407 // MASM-flavor octal integer: [0-7]+[oOqQ]
408 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
409 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
410 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
411 const char *FirstNonBinary =
412 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
413 const char *FirstNonDecimal =
414 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
415 const char *OldCurPtr = CurPtr;
416 while (isHexDigit(*CurPtr)) {
417 switch (*CurPtr) {
418 default:
419 if (!FirstNonDecimal) {
420 FirstNonDecimal = CurPtr;
421 }
422 [[fallthrough]];
423 case '9':
424 case '8':
425 case '7':
426 case '6':
427 case '5':
428 case '4':
429 case '3':
430 case '2':
431 if (!FirstNonBinary) {
432 FirstNonBinary = CurPtr;
433 }
434 break;
435 case '1':
436 case '0':
437 break;
438 }
439 ++CurPtr;
440 }
441 if (*CurPtr == '.') {
442 // MASM float literals (other than hex floats) always contain a ".", and
443 // are always written in decimal.
444 ++CurPtr;
445 return LexFloatLiteral();
446 }
447
448 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
449 ++CurPtr;
450 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
451 }
452
453 unsigned Radix = 0;
454 if (*CurPtr == 'h' || *CurPtr == 'H') {
455 // hexadecimal number
456 ++CurPtr;
457 Radix = 16;
458 } else if (*CurPtr == 't' || *CurPtr == 'T') {
459 // decimal number
460 ++CurPtr;
461 Radix = 10;
462 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
463 *CurPtr == 'Q') {
464 // octal number
465 ++CurPtr;
466 Radix = 8;
467 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
468 // binary number
469 ++CurPtr;
470 Radix = 2;
471 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
472 DefaultRadix < 14 &&
473 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
474 Radix = 10;
475 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
476 DefaultRadix < 12 &&
477 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
478 Radix = 2;
479 }
480
481 if (Radix) {
482 StringRef Result(TokStart, CurPtr - TokStart);
483 APInt Value(128, 0, true);
484
485 if (Result.drop_back().getAsInteger(Radix, Value))
486 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
487
488 // MSVC accepts and ignores type suffices on integer literals.
490
491 return intToken(Result, Value);
492 }
493
494 // default-radix integers, or floating point numbers, fall through
495 CurPtr = OldCurPtr;
496 }
497
498 // MASM default-radix integers: [0-9a-fA-F]+
499 // (All other integer literals have a radix specifier.)
500 if (LexMasmIntegers && UseMasmDefaultRadix) {
501 CurPtr = findLastDigit(CurPtr, 16);
502 StringRef Result(TokStart, CurPtr - TokStart);
503
504 APInt Value(128, 0, true);
505 if (Result.getAsInteger(DefaultRadix, Value)) {
506 return ReturnError(TokStart,
507 "invalid " + radixName(DefaultRadix) + " number");
508 }
509
510 return intToken(Result, Value);
511 }
512
513 // Motorola hex integers: $[0-9a-fA-F]+
514 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
515 const char *NumStart = CurPtr;
516 while (isHexDigit(CurPtr[0]))
517 ++CurPtr;
518
519 APInt Result(128, 0);
520 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
521 return ReturnError(TokStart, "invalid hexadecimal number");
522
523 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
524 }
525
526 // Motorola binary integers: %[01]+
527 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
528 const char *NumStart = CurPtr;
529 while (*CurPtr == '0' || *CurPtr == '1')
530 ++CurPtr;
531
532 APInt Result(128, 0);
533 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
534 return ReturnError(TokStart, "invalid binary number");
535
536 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
537 }
538
539 // Decimal integer: [1-9][0-9]*
540 // HLASM-flavour decimal integer: [0-9][0-9]*
541 // FIXME: Later on, support for fb for HLASM has to be added in
542 // as they probably would be needed for asm goto
543 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
544 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
545
546 if (!LexHLASMIntegers) {
547 bool IsHex = Radix == 16;
548 // Check for floating point literals.
549 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
550 if (*CurPtr == '.')
551 ++CurPtr;
552 return LexFloatLiteral();
553 }
554 }
555
556 StringRef Result(TokStart, CurPtr - TokStart);
557
558 APInt Value(128, 0, true);
559 if (Result.getAsInteger(Radix, Value))
560 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
561
562 if (!LexHLASMIntegers)
563 // The darwin/x86 (and x86-64) assembler accepts and ignores type
564 // suffices on integer literals.
566
567 return intToken(Result, Value);
568 }
569
570 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
571 ++CurPtr;
572 // See if we actually have "0b" as part of something like "jmp 0b\n"
573 if (!isDigit(CurPtr[0])) {
574 --CurPtr;
575 StringRef Result(TokStart, CurPtr - TokStart);
576 return AsmToken(AsmToken::Integer, Result, 0);
577 }
578 const char *NumStart = CurPtr;
579 while (CurPtr[0] == '0' || CurPtr[0] == '1')
580 ++CurPtr;
581
582 // Requires at least one binary digit.
583 if (CurPtr == NumStart)
584 return ReturnError(TokStart, "invalid binary number");
585
586 StringRef Result(TokStart, CurPtr - TokStart);
587
588 APInt Value(128, 0, true);
589 if (Result.substr(2).getAsInteger(2, Value))
590 return ReturnError(TokStart, "invalid binary number");
591
592 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
593 // suffixes on integer literals.
595
596 return intToken(Result, Value);
597 }
598
599 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
600 ++CurPtr;
601 const char *NumStart = CurPtr;
602 while (isHexDigit(CurPtr[0]))
603 ++CurPtr;
604
605 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
606 // diagnosed by LexHexFloatLiteral).
607 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
608 return LexHexFloatLiteral(NumStart == CurPtr);
609
610 // Otherwise requires at least one hex digit.
611 if (CurPtr == NumStart)
612 return ReturnError(CurPtr-2, "invalid hexadecimal number");
613
614 APInt Result(128, 0);
615 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
616 return ReturnError(TokStart, "invalid hexadecimal number");
617
618 // Consume the optional [hH].
619 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
620 ++CurPtr;
621
622 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
623 // suffixes on integer literals.
625
626 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
627 }
628
629 // Either octal or hexadecimal.
630 APInt Value(128, 0, true);
631 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
632 StringRef Result(TokStart, CurPtr - TokStart);
633 if (Result.getAsInteger(Radix, Value))
634 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
635
636 // Consume the [hH].
637 if (Radix == 16)
638 ++CurPtr;
639
640 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
641 // suffixes on integer literals.
643
644 return intToken(Result, Value);
645}
646
647/// LexSingleQuote: Integer: 'b'
648AsmToken AsmLexer::LexSingleQuote() {
649 int CurChar = getNextChar();
650
651 if (LexHLASMStrings)
652 return ReturnError(TokStart, "invalid usage of character literals");
653
654 if (LexMasmStrings) {
655 while (CurChar != EOF) {
656 if (CurChar != '\'') {
657 CurChar = getNextChar();
658 } else if (peekNextChar() == '\'') {
659 // In MASM single-quote strings, doubled single-quotes mean an escaped
660 // single quote, so should be lexed in.
661 (void)getNextChar();
662 CurChar = getNextChar();
663 } else {
664 break;
665 }
666 }
667 if (CurChar == EOF)
668 return ReturnError(TokStart, "unterminated string constant");
669 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
670 }
671
672 if (CurChar == '\\')
673 CurChar = getNextChar();
674
675 if (CurChar == EOF)
676 return ReturnError(TokStart, "unterminated single quote");
677
678 CurChar = getNextChar();
679
680 if (CurChar != '\'')
681 return ReturnError(TokStart, "single quote way too long");
682
683 // The idea here being that 'c' is basically just an integral
684 // constant.
685 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
686 long long Value;
687
688 if (Res.starts_with("\'\\")) {
689 char theChar = Res[2];
690 switch (theChar) {
691 default: Value = theChar; break;
692 case '\'': Value = '\''; break;
693 case 't': Value = '\t'; break;
694 case 'n': Value = '\n'; break;
695 case 'b': Value = '\b'; break;
696 case 'f': Value = '\f'; break;
697 case 'r': Value = '\r'; break;
698 }
699 } else
700 Value = TokStart[1];
701
702 return AsmToken(AsmToken::Integer, Res, Value);
703}
704
705/// LexQuote: String: "..."
706AsmToken AsmLexer::LexQuote() {
707 int CurChar = getNextChar();
708 if (LexHLASMStrings)
709 return ReturnError(TokStart, "invalid usage of string literals");
710
711 if (LexMasmStrings) {
712 while (CurChar != EOF) {
713 if (CurChar != '"') {
714 CurChar = getNextChar();
715 } else if (peekNextChar() == '"') {
716 // In MASM double-quoted strings, doubled double-quotes mean an escaped
717 // double quote, so should be lexed in.
718 (void)getNextChar();
719 CurChar = getNextChar();
720 } else {
721 break;
722 }
723 }
724 if (CurChar == EOF)
725 return ReturnError(TokStart, "unterminated string constant");
726 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
727 }
728
729 while (CurChar != '"') {
730 if (CurChar == '\\') {
731 // Allow \", etc.
732 CurChar = getNextChar();
733 }
734
735 if (CurChar == EOF)
736 return ReturnError(TokStart, "unterminated string constant");
737
738 CurChar = getNextChar();
739 }
740
741 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
742}
743
745 TokStart = CurPtr;
746
747 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
748 !isAtStatementSeparator(CurPtr) && // End of statement marker.
749 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
750 ++CurPtr;
751 }
752 return StringRef(TokStart, CurPtr-TokStart);
753}
754
755StringRef AsmLexer::LexUntilEndOfLine() {
756 TokStart = CurPtr;
757
758 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
759 ++CurPtr;
760 }
761 return StringRef(TokStart, CurPtr-TokStart);
762}
763
765 bool ShouldSkipSpace) {
766 SaveAndRestore SavedTokenStart(TokStart);
767 SaveAndRestore SavedCurPtr(CurPtr);
768 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
769 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
770 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
771 SaveAndRestore SavedIsPeeking(IsPeeking, true);
772 std::string SavedErr = getErr();
773 SMLoc SavedErrLoc = getErrLoc();
774
775 size_t ReadCount;
776 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
777 AsmToken Token = LexToken();
778
779 Buf[ReadCount] = Token;
780
781 if (Token.is(AsmToken::Eof)) {
782 ReadCount++;
783 break;
784 }
785 }
786
787 SetError(SavedErrLoc, SavedErr);
788 return ReadCount;
789}
790
791bool AsmLexer::isAtStartOfComment(const char *Ptr) {
792 if (MAI.isHLASM() && !IsAtStartOfStatement)
793 return false;
794
795 StringRef CommentString = MAI.getCommentString();
796
797 if (CommentString.size() == 1)
798 return CommentString[0] == Ptr[0];
799
800 // Allow # preprocessor comments also be counted as comments for "##" cases
801 if (CommentString[1] == '#')
802 return CommentString[0] == Ptr[0];
803
804 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
805}
806
807bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
808 return strncmp(Ptr, MAI.getSeparatorString(),
809 strlen(MAI.getSeparatorString())) == 0;
810}
811
812AsmToken AsmLexer::LexToken() {
813 TokStart = CurPtr;
814 // This always consumes at least one character.
815 int CurChar = getNextChar();
816
817 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
818 // If this starts with a '#', this may be a cpp
819 // hash directive and otherwise a line comment.
820 AsmToken TokenBuf[2];
821 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
822 size_t num = peekTokens(Buf, true);
823 // There cannot be a space preceding this
824 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
825 TokenBuf[1].is(AsmToken::String)) {
826 CurPtr = TokStart; // reset curPtr;
827 StringRef s = LexUntilEndOfLine();
828 UnLex(TokenBuf[1]);
829 UnLex(TokenBuf[0]);
831 }
832
834 return LexLineComment();
835 }
836
837 if (isAtStartOfComment(TokStart)) {
838 CurPtr += MAI.getCommentString().size() - 1;
839 return LexLineComment();
840 }
841
842 if (isAtStatementSeparator(TokStart)) {
843 CurPtr += strlen(MAI.getSeparatorString()) - 1;
844 IsAtStartOfLine = true;
845 IsAtStartOfStatement = true;
847 StringRef(TokStart, strlen(MAI.getSeparatorString())));
848 }
849
850 // If we're missing a newline at EOF, make sure we still get an
851 // EndOfStatement token before the Eof token.
852 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
853 IsAtStartOfLine = true;
854 IsAtStartOfStatement = true;
855 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
856 }
857 IsAtStartOfLine = false;
858 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
859 IsAtStartOfStatement = false;
860 switch (CurChar) {
861 default:
862 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
863 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
864 // an identifier is target-dependent. These characters are handled in the
865 // respective switch cases.
866 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
867 return LexIdentifier();
868
869 // Unknown character, emit an error.
870 return ReturnError(TokStart, "invalid character in input");
871 case EOF:
872 if (EndStatementAtEOF) {
873 IsAtStartOfLine = true;
874 IsAtStartOfStatement = true;
875 }
876 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
877 case 0:
878 case ' ':
879 case '\t':
880 IsAtStartOfStatement = OldIsAtStartOfStatement;
881 while (*CurPtr == ' ' || *CurPtr == '\t')
882 CurPtr++;
883 if (SkipSpace)
884 return LexToken(); // Ignore whitespace.
885 else
886 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
887 case '\r': {
888 IsAtStartOfLine = true;
889 IsAtStartOfStatement = true;
890 // If this is a CR followed by LF, treat that as one token.
891 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
892 ++CurPtr;
894 StringRef(TokStart, CurPtr - TokStart));
895 }
896 case '\n':
897 IsAtStartOfLine = true;
898 IsAtStartOfStatement = true;
899 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
900 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
901 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
902 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
903 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
904 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
905 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
906 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
907 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
908 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
909 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
910 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
911 case '$': {
912 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
913 return LexDigit();
915 return LexIdentifier();
916 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
917 }
918 case '@':
920 return LexIdentifier();
921 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
922 case '#':
923 if (MAI.isHLASM())
924 return LexIdentifier();
925 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
926 case '?':
928 return LexIdentifier();
929 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
930 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
931 case '=':
932 if (*CurPtr == '=') {
933 ++CurPtr;
934 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
935 }
936 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
937 case '-':
938 if (*CurPtr == '>') {
939 ++CurPtr;
940 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
941 }
942 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
943 case '|':
944 if (*CurPtr == '|') {
945 ++CurPtr;
946 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
947 }
948 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
949 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
950 case '&':
951 if (*CurPtr == '&') {
952 ++CurPtr;
953 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
954 }
955 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
956 case '!':
957 if (*CurPtr == '=') {
958 ++CurPtr;
959 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
960 }
961 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
962 case '%':
963 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
964 return LexDigit();
965 }
966 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
967 case '/':
968 IsAtStartOfStatement = OldIsAtStartOfStatement;
969 return LexSlash();
970 case '\'': return LexSingleQuote();
971 case '"': return LexQuote();
972 case '0': case '1': case '2': case '3': case '4':
973 case '5': case '6': case '7': case '8': case '9':
974 return LexDigit();
975 case '<':
976 switch (*CurPtr) {
977 case '<':
978 ++CurPtr;
979 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
980 case '=':
981 ++CurPtr;
982 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
983 case '>':
984 ++CurPtr;
985 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
986 default:
987 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
988 }
989 case '>':
990 switch (*CurPtr) {
991 case '>':
992 ++CurPtr;
993 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
994 case '=':
995 ++CurPtr;
996 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
997 default:
998 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
999 }
1000
1001 // TODO: Quoted identifiers (objc methods etc)
1002 // local labels: [0-9][:]
1003 // Forward/backward labels: [0-9][fb]
1004 // Integers, fp constants, character constants.
1005 }
1006}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::string radixName(unsigned Radix)
Definition: AsmLexer.cpp:383
static void SkipIgnoredIntegerSuffix(const char *&CurPtr)
Definition: AsmLexer.cpp:333
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, bool LexHex)
Definition: AsmLexer.cpp:345
static AsmToken intToken(StringRef Ref, APInt &Value)
Definition: AsmLexer.cpp:377
static const char * findLastDigit(const char *CurPtr, unsigned DefaultRadix)
Definition: AsmLexer.cpp:370
static bool isIdentifierChar(char C)
Return true if the given character satisfies the following regular expression: [-a-zA-Z$....
Definition: MILexer.cpp:118
static bool isDigit(const char C)
static bool isHexDigit(const char C)
raw_pwrite_stream & OS
This file provides utility classes that use RAII to save and restore values.
This file contains some functions that are useful when dealing with strings.
Class for arbitrary precision integers.
Definition: APInt.h:78
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
virtual void HandleComment(SMLoc Loc, StringRef CommentText)=0
Callback function for when a comment is lexed.
LLVM_ABI AsmLexer(const MCAsmInfo &MAI)
Definition: AsmLexer.cpp:111
void UnLex(AsmToken const &Token)
Definition: AsmLexer.h:106
bool is(AsmToken::TokenKind K) const
Check if the current token has kind K.
Definition: AsmLexer.h:147
SMLoc getErrLoc()
Get the current error location.
Definition: AsmLexer.h:138
const std::string & getErr()
Get the current error string.
Definition: AsmLexer.h:141
LLVM_ABI StringRef LexUntilEndOfStatement()
Definition: AsmLexer.cpp:744
LLVM_ABI void setBuffer(StringRef Buf, const char *ptr=nullptr, bool EndStatementAtEOF=true)
Definition: AsmLexer.cpp:121
LLVM_ABI size_t peekTokens(MutableArrayRef< AsmToken > Buf, bool ShouldSkipSpace=true)
Look ahead an arbitrary number of tokens.
Definition: AsmLexer.cpp:764
Target independent representation for an assembler token.
Definition: MCAsmMacro.h:22
LLVM_ABI SMLoc getLoc() const
Definition: AsmLexer.cpp:32
StringRef getString() const
Get the string for the current token, this includes all characters (for example, the quotes on string...
Definition: MCAsmMacro.h:103
bool is(TokenKind K) const
Definition: MCAsmMacro.h:75
LLVM_ABI SMLoc getEndLoc() const
Definition: AsmLexer.cpp:34
LLVM_ABI void dump(raw_ostream &OS) const
Definition: AsmLexer.cpp:40
LLVM_ABI SMRange getLocRange() const
Definition: AsmLexer.cpp:38
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition: MCAsmInfo.h:64
bool isHLASM() const
Definition: MCAsmInfo.h:520
bool useAtForSpecifier() const
Definition: MCAsmInfo.h:664
bool doesAllowDollarAtStartOfIdentifier() const
Definition: MCAsmInfo.h:569
bool shouldUseMotorolaIntegers() const
Definition: MCAsmInfo.h:719
StringRef getCommentString() const
Definition: MCAsmInfo.h:538
const char * getSeparatorString() const
Definition: MCAsmInfo.h:533
bool doesAllowAtAtStartOfIdentifier() const
Definition: MCAsmInfo.h:566
bool shouldAllowAdditionalComments() const
Definition: MCAsmInfo.h:539
bool doesAllowQuestionAtStartOfIdentifier() const
Definition: MCAsmInfo.h:563
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
Represents a location in source code.
Definition: SMLoc.h:23
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:36
Represents a range in source code.
Definition: SMLoc.h:48
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:269
iterator begin() const
Definition: StringRef.h:120
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:148
iterator end() const
Definition: StringRef.h:122
LLVM Value Representation.
Definition: Value.h:75
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
raw_ostream & write_escaped(StringRef Str, bool UseHexEscapes=false)
Output Str, turning '\', '\t', ' ', '"', and anything that doesn't satisfy llvm::isPrint into an esca...
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Ref
The access may reference the value stored in memory.
A utility class that uses RAII to save and restore the value of a variable.