|  | //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// | 
|  | // | 
|  | //                     The LLVM Compiler Infrastructure | 
|  | // | 
|  | // This file is distributed under the University of Illinois Open Source | 
|  | // License. See LICENSE.TXT for details. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // Implement the Lexer for TableGen. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "TGLexer.h" | 
|  | #include "llvm/ADT/StringSwitch.h" | 
|  | #include "llvm/ADT/Twine.h" | 
|  | #include "llvm/Config/config.h" // for strtoull()/strtoll() define | 
|  | #include "llvm/Support/Compiler.h" | 
|  | #include "llvm/Support/MemoryBuffer.h" | 
|  | #include "llvm/Support/SourceMgr.h" | 
|  | #include "llvm/TableGen/Error.h" | 
|  | #include <cctype> | 
|  | #include <cerrno> | 
|  | #include <cstdint> | 
|  | #include <cstdio> | 
|  | #include <cstdlib> | 
|  | #include <cstring> | 
|  |  | 
|  | using namespace llvm; | 
|  |  | 
|  | TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { | 
|  | CurBuffer = SrcMgr.getMainFileID(); | 
|  | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); | 
|  | CurPtr = CurBuf.begin(); | 
|  | TokStart = nullptr; | 
|  | } | 
|  |  | 
|  | SMLoc TGLexer::getLoc() const { | 
|  | return SMLoc::getFromPointer(TokStart); | 
|  | } | 
|  |  | 
|  | /// ReturnError - Set the error to the specified string at the specified | 
|  | /// location.  This is defined to always return tgtok::Error. | 
|  | tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { | 
|  | PrintError(Loc, Msg); | 
|  | return tgtok::Error; | 
|  | } | 
|  |  | 
|  | int TGLexer::getNextChar() { | 
|  | char CurChar = *CurPtr++; | 
|  | switch (CurChar) { | 
|  | default: | 
|  | return (unsigned char)CurChar; | 
|  | case 0: { | 
|  | // A nul character in the stream is either the end of the current buffer or | 
|  | // a random nul in the file.  Disambiguate that here. | 
|  | if (CurPtr-1 != CurBuf.end()) | 
|  | return 0;  // Just whitespace. | 
|  |  | 
|  | // If this is the end of an included file, pop the parent file off the | 
|  | // include stack. | 
|  | SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); | 
|  | if (ParentIncludeLoc != SMLoc()) { | 
|  | CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); | 
|  | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); | 
|  | CurPtr = ParentIncludeLoc.getPointer(); | 
|  | return getNextChar(); | 
|  | } | 
|  |  | 
|  | // Otherwise, return end of file. | 
|  | --CurPtr;  // Another call to lex will return EOF again. | 
|  | return EOF; | 
|  | } | 
|  | case '\n': | 
|  | case '\r': | 
|  | // Handle the newline character by ignoring it and incrementing the line | 
|  | // count.  However, be careful about 'dos style' files with \n\r in them. | 
|  | // Only treat a \n\r or \r\n as a single line. | 
|  | if ((*CurPtr == '\n' || (*CurPtr == '\r')) && | 
|  | *CurPtr != CurChar) | 
|  | ++CurPtr;  // Eat the two char newline sequence. | 
|  | return '\n'; | 
|  | } | 
|  | } | 
|  |  | 
|  | int TGLexer::peekNextChar(int Index) { | 
|  | return *(CurPtr + Index); | 
|  | } | 
|  |  | 
|  | tgtok::TokKind TGLexer::LexToken() { | 
|  | TokStart = CurPtr; | 
|  | // This always consumes at least one character. | 
|  | int CurChar = getNextChar(); | 
|  |  | 
|  | switch (CurChar) { | 
|  | default: | 
|  | // Handle letters: [a-zA-Z_] | 
|  | if (isalpha(CurChar) || CurChar == '_') | 
|  | return LexIdentifier(); | 
|  |  | 
|  | // Unknown character, emit an error. | 
|  | return ReturnError(TokStart, "Unexpected character"); | 
|  | case EOF: return tgtok::Eof; | 
|  | case ':': return tgtok::colon; | 
|  | case ';': return tgtok::semi; | 
|  | case '.': return tgtok::period; | 
|  | case ',': return tgtok::comma; | 
|  | case '<': return tgtok::less; | 
|  | case '>': return tgtok::greater; | 
|  | case ']': return tgtok::r_square; | 
|  | case '{': return tgtok::l_brace; | 
|  | case '}': return tgtok::r_brace; | 
|  | case '(': return tgtok::l_paren; | 
|  | case ')': return tgtok::r_paren; | 
|  | case '=': return tgtok::equal; | 
|  | case '?': return tgtok::question; | 
|  | case '#': return tgtok::paste; | 
|  |  | 
|  | case 0: | 
|  | case ' ': | 
|  | case '\t': | 
|  | case '\n': | 
|  | case '\r': | 
|  | // Ignore whitespace. | 
|  | return LexToken(); | 
|  | case '/': | 
|  | // If this is the start of a // comment, skip until the end of the line or | 
|  | // the end of the buffer. | 
|  | if (*CurPtr == '/') | 
|  | SkipBCPLComment(); | 
|  | else if (*CurPtr == '*') { | 
|  | if (SkipCComment()) | 
|  | return tgtok::Error; | 
|  | } else // Otherwise, this is an error. | 
|  | return ReturnError(TokStart, "Unexpected character"); | 
|  | return LexToken(); | 
|  | case '-': case '+': | 
|  | case '0': case '1': case '2': case '3': case '4': case '5': case '6': | 
|  | case '7': case '8': case '9': { | 
|  | int NextChar = 0; | 
|  | if (isdigit(CurChar)) { | 
|  | // Allow identifiers to start with a number if it is followed by | 
|  | // an identifier.  This can happen with paste operations like | 
|  | // foo#8i. | 
|  | int i = 0; | 
|  | do { | 
|  | NextChar = peekNextChar(i++); | 
|  | } while (isdigit(NextChar)); | 
|  |  | 
|  | if (NextChar == 'x' || NextChar == 'b') { | 
|  | // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most | 
|  | // likely a number. | 
|  | int NextNextChar = peekNextChar(i); | 
|  | switch (NextNextChar) { | 
|  | default: | 
|  | break; | 
|  | case '0': case '1': | 
|  | if (NextChar == 'b') | 
|  | return LexNumber(); | 
|  | LLVM_FALLTHROUGH; | 
|  | case '2': case '3': case '4': case '5': | 
|  | case '6': case '7': case '8': case '9': | 
|  | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | 
|  | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | 
|  | if (NextChar == 'x') | 
|  | return LexNumber(); | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (isalpha(NextChar) || NextChar == '_') | 
|  | return LexIdentifier(); | 
|  |  | 
|  | return LexNumber(); | 
|  | } | 
|  | case '"': return LexString(); | 
|  | case '$': return LexVarName(); | 
|  | case '[': return LexBracket(); | 
|  | case '!': return LexExclaim(); | 
|  | } | 
|  | } | 
|  |  | 
|  | /// LexString - Lex "[^"]*" | 
|  | tgtok::TokKind TGLexer::LexString() { | 
|  | const char *StrStart = CurPtr; | 
|  |  | 
|  | CurStrVal = ""; | 
|  |  | 
|  | while (*CurPtr != '"') { | 
|  | // If we hit the end of the buffer, report an error. | 
|  | if (*CurPtr == 0 && CurPtr == CurBuf.end()) | 
|  | return ReturnError(StrStart, "End of file in string literal"); | 
|  |  | 
|  | if (*CurPtr == '\n' || *CurPtr == '\r') | 
|  | return ReturnError(StrStart, "End of line in string literal"); | 
|  |  | 
|  | if (*CurPtr != '\\') { | 
|  | CurStrVal += *CurPtr++; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | ++CurPtr; | 
|  |  | 
|  | switch (*CurPtr) { | 
|  | case '\\': case '\'': case '"': | 
|  | // These turn into their literal character. | 
|  | CurStrVal += *CurPtr++; | 
|  | break; | 
|  | case 't': | 
|  | CurStrVal += '\t'; | 
|  | ++CurPtr; | 
|  | break; | 
|  | case 'n': | 
|  | CurStrVal += '\n'; | 
|  | ++CurPtr; | 
|  | break; | 
|  |  | 
|  | case '\n': | 
|  | case '\r': | 
|  | return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); | 
|  |  | 
|  | // If we hit the end of the buffer, report an error. | 
|  | case '\0': | 
|  | if (CurPtr == CurBuf.end()) | 
|  | return ReturnError(StrStart, "End of file in string literal"); | 
|  | LLVM_FALLTHROUGH; | 
|  | default: | 
|  | return ReturnError(CurPtr, "invalid escape in string literal"); | 
|  | } | 
|  | } | 
|  |  | 
|  | ++CurPtr; | 
|  | return tgtok::StrVal; | 
|  | } | 
|  |  | 
|  | tgtok::TokKind TGLexer::LexVarName() { | 
|  | if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') | 
|  | return ReturnError(TokStart, "Invalid variable name"); | 
|  |  | 
|  | // Otherwise, we're ok, consume the rest of the characters. | 
|  | const char *VarNameStart = CurPtr++; | 
|  |  | 
|  | while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') | 
|  | ++CurPtr; | 
|  |  | 
|  | CurStrVal.assign(VarNameStart, CurPtr); | 
|  | return tgtok::VarName; | 
|  | } | 
|  |  | 
|  | tgtok::TokKind TGLexer::LexIdentifier() { | 
|  | // The first letter is [a-zA-Z_#]. | 
|  | const char *IdentStart = TokStart; | 
|  |  | 
|  | // Match the rest of the identifier regex: [0-9a-zA-Z_#]* | 
|  | while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') | 
|  | ++CurPtr; | 
|  |  | 
|  | // Check to see if this identifier is a keyword. | 
|  | StringRef Str(IdentStart, CurPtr-IdentStart); | 
|  |  | 
|  | if (Str == "include") { | 
|  | if (LexInclude()) return tgtok::Error; | 
|  | return Lex(); | 
|  | } | 
|  |  | 
|  | tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) | 
|  | .Case("int", tgtok::Int) | 
|  | .Case("bit", tgtok::Bit) | 
|  | .Case("bits", tgtok::Bits) | 
|  | .Case("string", tgtok::String) | 
|  | .Case("list", tgtok::List) | 
|  | .Case("code", tgtok::Code) | 
|  | .Case("dag", tgtok::Dag) | 
|  | .Case("class", tgtok::Class) | 
|  | .Case("def", tgtok::Def) | 
|  | .Case("foreach", tgtok::Foreach) | 
|  | .Case("defm", tgtok::Defm) | 
|  | .Case("defset", tgtok::Defset) | 
|  | .Case("multiclass", tgtok::MultiClass) | 
|  | .Case("field", tgtok::Field) | 
|  | .Case("let", tgtok::Let) | 
|  | .Case("in", tgtok::In) | 
|  | .Default(tgtok::Id); | 
|  |  | 
|  | if (Kind == tgtok::Id) | 
|  | CurStrVal.assign(Str.begin(), Str.end()); | 
|  | return Kind; | 
|  | } | 
|  |  | 
|  | /// LexInclude - We just read the "include" token.  Get the string token that | 
|  | /// comes next and enter the include. | 
|  | bool TGLexer::LexInclude() { | 
|  | // The token after the include must be a string. | 
|  | tgtok::TokKind Tok = LexToken(); | 
|  | if (Tok == tgtok::Error) return true; | 
|  | if (Tok != tgtok::StrVal) { | 
|  | PrintError(getLoc(), "Expected filename after include"); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // Get the string. | 
|  | std::string Filename = CurStrVal; | 
|  | std::string IncludedFile; | 
|  |  | 
|  | CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), | 
|  | IncludedFile); | 
|  | if (!CurBuffer) { | 
|  | PrintError(getLoc(), "Could not find include file '" + Filename + "'"); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); | 
|  | if (Found != Dependencies.end()) { | 
|  | PrintError(getLoc(), | 
|  | "File '" + IncludedFile + "' has already been included."); | 
|  | SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, | 
|  | "previously included here"); | 
|  | return true; | 
|  | } | 
|  | Dependencies.insert(std::make_pair(IncludedFile, getLoc())); | 
|  | // Save the line number and lex buffer of the includer. | 
|  | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); | 
|  | CurPtr = CurBuf.begin(); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | void TGLexer::SkipBCPLComment() { | 
|  | ++CurPtr;  // skip the second slash. | 
|  | while (true) { | 
|  | switch (*CurPtr) { | 
|  | case '\n': | 
|  | case '\r': | 
|  | return;  // Newline is end of comment. | 
|  | case 0: | 
|  | // If this is the end of the buffer, end the comment. | 
|  | if (CurPtr == CurBuf.end()) | 
|  | return; | 
|  | break; | 
|  | } | 
|  | // Otherwise, skip the character. | 
|  | ++CurPtr; | 
|  | } | 
|  | } | 
|  |  | 
|  | /// SkipCComment - This skips C-style /**/ comments.  The only difference from C | 
|  | /// is that we allow nesting. | 
|  | bool TGLexer::SkipCComment() { | 
|  | ++CurPtr;  // skip the star. | 
|  | unsigned CommentDepth = 1; | 
|  |  | 
|  | while (true) { | 
|  | int CurChar = getNextChar(); | 
|  | switch (CurChar) { | 
|  | case EOF: | 
|  | PrintError(TokStart, "Unterminated comment!"); | 
|  | return true; | 
|  | case '*': | 
|  | // End of the comment? | 
|  | if (CurPtr[0] != '/') break; | 
|  |  | 
|  | ++CurPtr;   // End the */. | 
|  | if (--CommentDepth == 0) | 
|  | return false; | 
|  | break; | 
|  | case '/': | 
|  | // Start of a nested comment? | 
|  | if (CurPtr[0] != '*') break; | 
|  | ++CurPtr; | 
|  | ++CommentDepth; | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// LexNumber - Lex: | 
|  | ///    [-+]?[0-9]+ | 
|  | ///    0x[0-9a-fA-F]+ | 
|  | ///    0b[01]+ | 
|  | tgtok::TokKind TGLexer::LexNumber() { | 
|  | if (CurPtr[-1] == '0') { | 
|  | if (CurPtr[0] == 'x') { | 
|  | ++CurPtr; | 
|  | const char *NumStart = CurPtr; | 
|  | while (isxdigit(CurPtr[0])) | 
|  | ++CurPtr; | 
|  |  | 
|  | // Requires at least one hex digit. | 
|  | if (CurPtr == NumStart) | 
|  | return ReturnError(TokStart, "Invalid hexadecimal number"); | 
|  |  | 
|  | errno = 0; | 
|  | CurIntVal = strtoll(NumStart, nullptr, 16); | 
|  | if (errno == EINVAL) | 
|  | return ReturnError(TokStart, "Invalid hexadecimal number"); | 
|  | if (errno == ERANGE) { | 
|  | errno = 0; | 
|  | CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); | 
|  | if (errno == EINVAL) | 
|  | return ReturnError(TokStart, "Invalid hexadecimal number"); | 
|  | if (errno == ERANGE) | 
|  | return ReturnError(TokStart, "Hexadecimal number out of range"); | 
|  | } | 
|  | return tgtok::IntVal; | 
|  | } else if (CurPtr[0] == 'b') { | 
|  | ++CurPtr; | 
|  | const char *NumStart = CurPtr; | 
|  | while (CurPtr[0] == '0' || CurPtr[0] == '1') | 
|  | ++CurPtr; | 
|  |  | 
|  | // Requires at least one binary digit. | 
|  | if (CurPtr == NumStart) | 
|  | return ReturnError(CurPtr-2, "Invalid binary number"); | 
|  | CurIntVal = strtoll(NumStart, nullptr, 2); | 
|  | return tgtok::BinaryIntVal; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Check for a sign without a digit. | 
|  | if (!isdigit(CurPtr[0])) { | 
|  | if (CurPtr[-1] == '-') | 
|  | return tgtok::minus; | 
|  | else if (CurPtr[-1] == '+') | 
|  | return tgtok::plus; | 
|  | } | 
|  |  | 
|  | while (isdigit(CurPtr[0])) | 
|  | ++CurPtr; | 
|  | CurIntVal = strtoll(TokStart, nullptr, 10); | 
|  | return tgtok::IntVal; | 
|  | } | 
|  |  | 
|  | /// LexBracket - We just read '['.  If this is a code block, return it, | 
|  | /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' | 
|  | tgtok::TokKind TGLexer::LexBracket() { | 
|  | if (CurPtr[0] != '{') | 
|  | return tgtok::l_square; | 
|  | ++CurPtr; | 
|  | const char *CodeStart = CurPtr; | 
|  | while (true) { | 
|  | int Char = getNextChar(); | 
|  | if (Char == EOF) break; | 
|  |  | 
|  | if (Char != '}') continue; | 
|  |  | 
|  | Char = getNextChar(); | 
|  | if (Char == EOF) break; | 
|  | if (Char == ']') { | 
|  | CurStrVal.assign(CodeStart, CurPtr-2); | 
|  | return tgtok::CodeFragment; | 
|  | } | 
|  | } | 
|  |  | 
|  | return ReturnError(CodeStart-2, "Unterminated Code Block"); | 
|  | } | 
|  |  | 
|  | /// LexExclaim - Lex '!' and '![a-zA-Z]+'. | 
|  | tgtok::TokKind TGLexer::LexExclaim() { | 
|  | if (!isalpha(*CurPtr)) | 
|  | return ReturnError(CurPtr - 1, "Invalid \"!operator\""); | 
|  |  | 
|  | const char *Start = CurPtr++; | 
|  | while (isalpha(*CurPtr)) | 
|  | ++CurPtr; | 
|  |  | 
|  | // Check to see which operator this is. | 
|  | tgtok::TokKind Kind = | 
|  | StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) | 
|  | .Case("eq", tgtok::XEq) | 
|  | .Case("ne", tgtok::XNe) | 
|  | .Case("le", tgtok::XLe) | 
|  | .Case("lt", tgtok::XLt) | 
|  | .Case("ge", tgtok::XGe) | 
|  | .Case("gt", tgtok::XGt) | 
|  | .Case("if", tgtok::XIf) | 
|  | .Case("isa", tgtok::XIsA) | 
|  | .Case("head", tgtok::XHead) | 
|  | .Case("tail", tgtok::XTail) | 
|  | .Case("size", tgtok::XSize) | 
|  | .Case("con", tgtok::XConcat) | 
|  | .Case("dag", tgtok::XDag) | 
|  | .Case("add", tgtok::XADD) | 
|  | .Case("and", tgtok::XAND) | 
|  | .Case("or", tgtok::XOR) | 
|  | .Case("shl", tgtok::XSHL) | 
|  | .Case("sra", tgtok::XSRA) | 
|  | .Case("srl", tgtok::XSRL) | 
|  | .Case("cast", tgtok::XCast) | 
|  | .Case("empty", tgtok::XEmpty) | 
|  | .Case("subst", tgtok::XSubst) | 
|  | .Case("foldl", tgtok::XFoldl) | 
|  | .Case("foreach", tgtok::XForEach) | 
|  | .Case("listconcat", tgtok::XListConcat) | 
|  | .Case("strconcat", tgtok::XStrConcat) | 
|  | .Default(tgtok::Error); | 
|  |  | 
|  | return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); | 
|  | } |