| //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This class implements the lexer for assembly files. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/MC/MCParser/AsmLexer.h" |
| #include "llvm/Support/SMLoc.h" |
| #include "llvm/Support/MemoryBuffer.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include <cctype> |
| #include <cerrno> |
| #include <cstdio> |
| #include <cstdlib> |
| using namespace llvm; |
| |
| AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { |
| CurBuf = NULL; |
| CurPtr = NULL; |
| isAtStartOfLine = true; |
| } |
| |
| AsmLexer::~AsmLexer() { |
| } |
| |
| void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { |
| CurBuf = buf; |
| |
| if (ptr) |
| CurPtr = ptr; |
| else |
| CurPtr = CurBuf->getBufferStart(); |
| |
| TokStart = 0; |
| } |
| |
| /// ReturnError - Set the error to the specified string at the specified |
| /// location. This is defined to always return AsmToken::Error. |
| AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { |
| SetError(SMLoc::getFromPointer(Loc), Msg); |
| |
| return AsmToken(AsmToken::Error, StringRef(Loc, 0)); |
| } |
| |
| int AsmLexer::getNextChar() { |
| char CurChar = *CurPtr++; |
| switch (CurChar) { |
| default: |
| return (unsigned char)CurChar; |
| case 0: |
| // A nul character in the stream is either the end of the current buffer or |
| // a random nul in the file. Disambiguate that here. |
| if (CurPtr-1 != CurBuf->getBufferEnd()) |
| return 0; // Just whitespace. |
| |
| // Otherwise, return end of file. |
| --CurPtr; // Another call to lex will return EOF again. |
| return EOF; |
| } |
| } |
| |
| /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? |
| /// |
| /// The leading integral digit sequence and dot should have already been |
| /// consumed, some or all of the fractional digit sequence *can* have been |
| /// consumed. |
| AsmToken AsmLexer::LexFloatLiteral() { |
| // Skip the fractional digit sequence. |
| while (isdigit(*CurPtr)) |
| ++CurPtr; |
| |
| // Check for exponent; we intentionally accept a slighlty wider set of |
| // literals here and rely on the upstream client to reject invalid ones (e.g., |
| // "1e+"). |
| if (*CurPtr == 'e' || *CurPtr == 'E') { |
| ++CurPtr; |
| if (*CurPtr == '-' || *CurPtr == '+') |
| ++CurPtr; |
| while (isdigit(*CurPtr)) |
| ++CurPtr; |
| } |
| |
| return AsmToken(AsmToken::Real, |
| StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* |
| static bool IsIdentifierChar(char c) { |
| return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; |
| } |
| AsmToken AsmLexer::LexIdentifier() { |
| // Check for floating point literals. |
| if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { |
| // Disambiguate a .1243foo identifier from a floating literal. |
| while (isdigit(*CurPtr)) |
| ++CurPtr; |
| if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) |
| return LexFloatLiteral(); |
| } |
| |
| while (IsIdentifierChar(*CurPtr)) |
| ++CurPtr; |
| |
| // Handle . as a special case. |
| if (CurPtr == TokStart+1 && TokStart[0] == '.') |
| return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); |
| |
| return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| /// LexSlash: Slash: / |
| /// C-Style Comment: /* ... */ |
| AsmToken AsmLexer::LexSlash() { |
| switch (*CurPtr) { |
| case '*': break; // C style comment. |
| case '/': return ++CurPtr, LexLineComment(); |
| default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); |
| } |
| |
| // C Style comment. |
| ++CurPtr; // skip the star. |
| while (1) { |
| int CurChar = getNextChar(); |
| switch (CurChar) { |
| case EOF: |
| return ReturnError(TokStart, "unterminated comment"); |
| case '*': |
| // End of the comment? |
| if (CurPtr[0] != '/') break; |
| |
| ++CurPtr; // End the */. |
| return LexToken(); |
| } |
| } |
| } |
| |
| /// LexLineComment: Comment: #[^\n]* |
| /// : //[^\n]* |
| AsmToken AsmLexer::LexLineComment() { |
| // FIXME: This is broken if we happen to a comment at the end of a file, which |
| // was .included, and which doesn't end with a newline. |
| int CurChar = getNextChar(); |
| while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) |
| CurChar = getNextChar(); |
| |
| if (CurChar == EOF) |
| return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); |
| return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); |
| } |
| |
| static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { |
| if (CurPtr[0] == 'L' && CurPtr[1] == 'L') |
| CurPtr += 2; |
| if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') |
| CurPtr += 3; |
| } |
| |
| /// LexDigit: First character is [0-9]. |
| /// Local Label: [0-9][:] |
| /// Forward/Backward Label: [0-9][fb] |
| /// Binary integer: 0b[01]+ |
| /// Octal integer: 0[0-7]+ |
| /// Hex integer: 0x[0-9a-fA-F]+ |
| /// Decimal integer: [1-9][0-9]* |
| AsmToken AsmLexer::LexDigit() { |
| // Decimal integer: [1-9][0-9]* |
| if (CurPtr[-1] != '0' || CurPtr[0] == '.') { |
| while (isdigit(*CurPtr)) |
| ++CurPtr; |
| |
| // Check for floating point literals. |
| if (*CurPtr == '.' || *CurPtr == 'e') { |
| ++CurPtr; |
| return LexFloatLiteral(); |
| } |
| |
| StringRef Result(TokStart, CurPtr - TokStart); |
| |
| long long Value; |
| if (Result.getAsInteger(10, Value)) { |
| // Allow positive values that are too large to fit into a signed 64-bit |
| // integer, but that do fit in an unsigned one, we just convert them over. |
| unsigned long long UValue; |
| if (Result.getAsInteger(10, UValue)) |
| return ReturnError(TokStart, "invalid decimal number"); |
| Value = (long long)UValue; |
| } |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return AsmToken(AsmToken::Integer, Result, Value); |
| } |
| |
| if (*CurPtr == 'b') { |
| ++CurPtr; |
| // See if we actually have "0b" as part of something like "jmp 0b\n" |
| if (!isdigit(CurPtr[0])) { |
| --CurPtr; |
| StringRef Result(TokStart, CurPtr - TokStart); |
| return AsmToken(AsmToken::Integer, Result, 0); |
| } |
| const char *NumStart = CurPtr; |
| while (CurPtr[0] == '0' || CurPtr[0] == '1') |
| ++CurPtr; |
| |
| // Requires at least one binary digit. |
| if (CurPtr == NumStart) |
| return ReturnError(TokStart, "invalid binary number"); |
| |
| StringRef Result(TokStart, CurPtr - TokStart); |
| |
| long long Value; |
| if (Result.substr(2).getAsInteger(2, Value)) |
| return ReturnError(TokStart, "invalid binary number"); |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return AsmToken(AsmToken::Integer, Result, Value); |
| } |
| |
| if (*CurPtr == 'x') { |
| ++CurPtr; |
| const char *NumStart = CurPtr; |
| while (isxdigit(CurPtr[0])) |
| ++CurPtr; |
| |
| // Requires at least one hex digit. |
| if (CurPtr == NumStart) |
| return ReturnError(CurPtr-2, "invalid hexadecimal number"); |
| |
| unsigned long long Result; |
| if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) |
| return ReturnError(TokStart, "invalid hexadecimal number"); |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), |
| (int64_t)Result); |
| } |
| |
| // Must be an octal number, it starts with 0. |
| while (*CurPtr >= '0' && *CurPtr <= '9') |
| ++CurPtr; |
| |
| StringRef Result(TokStart, CurPtr - TokStart); |
| long long Value; |
| if (Result.getAsInteger(8, Value)) |
| return ReturnError(TokStart, "invalid octal number"); |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return AsmToken(AsmToken::Integer, Result, Value); |
| } |
| |
| /// LexSingleQuote: Integer: 'b' |
| AsmToken AsmLexer::LexSingleQuote() { |
| int CurChar = getNextChar(); |
| |
| if (CurChar == '\\') |
| CurChar = getNextChar(); |
| |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated single quote"); |
| |
| CurChar = getNextChar(); |
| |
| if (CurChar != '\'') |
| return ReturnError(TokStart, "single quote way too long"); |
| |
| // The idea here being that 'c' is basically just an integral |
| // constant. |
| StringRef Res = StringRef(TokStart,CurPtr - TokStart); |
| long long Value; |
| |
| if (Res.startswith("\'\\")) { |
| char theChar = Res[2]; |
| switch (theChar) { |
| default: Value = theChar; break; |
| case '\'': Value = '\''; break; |
| case 't': Value = '\t'; break; |
| case 'n': Value = '\n'; break; |
| case 'b': Value = '\b'; break; |
| } |
| } else |
| Value = TokStart[1]; |
| |
| return AsmToken(AsmToken::Integer, Res, Value); |
| } |
| |
| |
| /// LexQuote: String: "..." |
| AsmToken AsmLexer::LexQuote() { |
| int CurChar = getNextChar(); |
| // TODO: does gas allow multiline string constants? |
| while (CurChar != '"') { |
| if (CurChar == '\\') { |
| // Allow \", etc. |
| CurChar = getNextChar(); |
| } |
| |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated string constant"); |
| |
| CurChar = getNextChar(); |
| } |
| |
| return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| StringRef AsmLexer::LexUntilEndOfStatement() { |
| TokStart = CurPtr; |
| |
| while (!isAtStartOfComment(*CurPtr) && // Start of line comment. |
| !isAtStatementSeparator(CurPtr) && // End of statement marker. |
| *CurPtr != '\n' && |
| *CurPtr != '\r' && |
| (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { |
| ++CurPtr; |
| } |
| return StringRef(TokStart, CurPtr-TokStart); |
| } |
| |
| StringRef AsmLexer::LexUntilEndOfLine() { |
| TokStart = CurPtr; |
| |
| while (*CurPtr != '\n' && |
| *CurPtr != '\r' && |
| (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { |
| ++CurPtr; |
| } |
| return StringRef(TokStart, CurPtr-TokStart); |
| } |
| |
| bool AsmLexer::isAtStartOfComment(char Char) { |
| // FIXME: This won't work for multi-character comment indicators like "//". |
| return Char == *MAI.getCommentString(); |
| } |
| |
| bool AsmLexer::isAtStatementSeparator(const char *Ptr) { |
| return strncmp(Ptr, MAI.getSeparatorString(), |
| strlen(MAI.getSeparatorString())) == 0; |
| } |
| |
| AsmToken AsmLexer::LexToken() { |
| TokStart = CurPtr; |
| // This always consumes at least one character. |
| int CurChar = getNextChar(); |
| |
| if (isAtStartOfComment(CurChar)) { |
| // If this comment starts with a '#', then return the Hash token and let |
| // the assembler parser see if it can be parsed as a cpp line filename |
| // comment. We do this only if we are at the start of a line. |
| if (CurChar == '#' && isAtStartOfLine) |
| return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); |
| isAtStartOfLine = true; |
| return LexLineComment(); |
| } |
| if (isAtStatementSeparator(TokStart)) { |
| CurPtr += strlen(MAI.getSeparatorString()) - 1; |
| return AsmToken(AsmToken::EndOfStatement, |
| StringRef(TokStart, strlen(MAI.getSeparatorString()))); |
| } |
| |
| // If we're missing a newline at EOF, make sure we still get an |
| // EndOfStatement token before the Eof token. |
| if (CurChar == EOF && !isAtStartOfLine) { |
| isAtStartOfLine = true; |
| return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); |
| } |
| |
| isAtStartOfLine = false; |
| switch (CurChar) { |
| default: |
| // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* |
| if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') |
| return LexIdentifier(); |
| |
| // Unknown character, emit an error. |
| return ReturnError(TokStart, "invalid character in input"); |
| case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); |
| case 0: |
| case ' ': |
| case '\t': |
| // Ignore whitespace. |
| return LexToken(); |
| case '\n': // FALL THROUGH. |
| case '\r': |
| isAtStartOfLine = true; |
| return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); |
| case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); |
| case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); |
| case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); |
| case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); |
| case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); |
| case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); |
| case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); |
| case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); |
| case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); |
| case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); |
| case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); |
| case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); |
| case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); |
| case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); |
| case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); |
| case '=': |
| if (*CurPtr == '=') |
| return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); |
| return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); |
| case '|': |
| if (*CurPtr == '|') |
| return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); |
| return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); |
| case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); |
| case '&': |
| if (*CurPtr == '&') |
| return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); |
| return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); |
| case '!': |
| if (*CurPtr == '=') |
| return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); |
| return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); |
| case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); |
| case '/': return LexSlash(); |
| case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); |
| case '\'': return LexSingleQuote(); |
| case '"': return LexQuote(); |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| return LexDigit(); |
| case '<': |
| switch (*CurPtr) { |
| case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, |
| StringRef(TokStart, 2)); |
| case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, |
| StringRef(TokStart, 2)); |
| case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, |
| StringRef(TokStart, 2)); |
| default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); |
| } |
| case '>': |
| switch (*CurPtr) { |
| case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, |
| StringRef(TokStart, 2)); |
| case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, |
| StringRef(TokStart, 2)); |
| default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); |
| } |
| |
| // TODO: Quoted identifiers (objc methods etc) |
| // local labels: [0-9][:] |
| // Forward/backward labels: [0-9][fb] |
| // Integers, fp constants, character constants. |
| } |
| } |