| //===-- Regex.cpp - Regular Expression matcher implementation -------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements a POSIX regular expression matcher. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Support/Regex.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include <string> |
| |
| // Important this comes last because it defines "_REGEX_H_". At least on |
| // Darwin, if included before any header that (transitively) includes |
| // xlocale.h, this will cause trouble, because of missing regex-related types. |
| #include "regex_impl.h" |
| |
| using namespace llvm; |
| |
| Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} |
| |
| Regex::Regex(StringRef regex, unsigned Flags) { |
| unsigned flags = 0; |
| preg = new llvm_regex(); |
| preg->re_endp = regex.end(); |
| if (Flags & IgnoreCase) |
| flags |= REG_ICASE; |
| if (Flags & Newline) |
| flags |= REG_NEWLINE; |
| if (!(Flags & BasicRegex)) |
| flags |= REG_EXTENDED; |
| error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); |
| } |
| |
| Regex::Regex(Regex &®ex) { |
| preg = regex.preg; |
| error = regex.error; |
| regex.preg = nullptr; |
| regex.error = REG_BADPAT; |
| } |
| |
| Regex::~Regex() { |
| if (preg) { |
| llvm_regfree(preg); |
| delete preg; |
| } |
| } |
| |
| namespace { |
| |
| /// Utility to convert a regex error code into a human-readable string. |
| void RegexErrorToString(int error, struct llvm_regex *preg, |
| std::string &Error) { |
| size_t len = llvm_regerror(error, preg, nullptr, 0); |
| |
| Error.resize(len - 1); |
| llvm_regerror(error, preg, &Error[0], len); |
| } |
| |
| } // namespace |
| |
| bool Regex::isValid(std::string &Error) const { |
| if (!error) |
| return true; |
| |
| RegexErrorToString(error, preg, Error); |
| return false; |
| } |
| |
| /// getNumMatches - In a valid regex, return the number of parenthesized |
| /// matches it contains. |
| unsigned Regex::getNumMatches() const { |
| return preg->re_nsub; |
| } |
| |
| bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, |
| std::string *Error) const { |
| // Reset error, if given. |
| if (Error && !Error->empty()) |
| *Error = ""; |
| |
| // Check if the regex itself didn't successfully compile. |
| if (Error ? !isValid(*Error) : !isValid()) |
| return false; |
| |
| unsigned nmatch = Matches ? preg->re_nsub+1 : 0; |
| |
| // pmatch needs to have at least one element. |
| SmallVector<llvm_regmatch_t, 8> pm; |
| pm.resize(nmatch > 0 ? nmatch : 1); |
| pm[0].rm_so = 0; |
| pm[0].rm_eo = String.size(); |
| |
| int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); |
| |
| // Failure to match is not an error, it's just a normal return value. |
| // Any other error code is considered abnormal, and is logged in the Error. |
| if (rc == REG_NOMATCH) |
| return false; |
| if (rc != 0) { |
| if (Error) |
| RegexErrorToString(error, preg, *Error); |
| return false; |
| } |
| |
| // There was a match. |
| |
| if (Matches) { // match position requested |
| Matches->clear(); |
| |
| for (unsigned i = 0; i != nmatch; ++i) { |
| if (pm[i].rm_so == -1) { |
| // this group didn't match |
| Matches->push_back(StringRef()); |
| continue; |
| } |
| assert(pm[i].rm_eo >= pm[i].rm_so); |
| Matches->push_back(StringRef(String.data()+pm[i].rm_so, |
| pm[i].rm_eo-pm[i].rm_so)); |
| } |
| } |
| |
| return true; |
| } |
| |
| std::string Regex::sub(StringRef Repl, StringRef String, |
| std::string *Error) const { |
| SmallVector<StringRef, 8> Matches; |
| |
| // Return the input if there was no match. |
| if (!match(String, &Matches, Error)) |
| return String; |
| |
| // Otherwise splice in the replacement string, starting with the prefix before |
| // the match. |
| std::string Res(String.begin(), Matches[0].begin()); |
| |
| // Then the replacement string, honoring possible substitutions. |
| while (!Repl.empty()) { |
| // Skip to the next escape. |
| std::pair<StringRef, StringRef> Split = Repl.split('\\'); |
| |
| // Add the skipped substring. |
| Res += Split.first; |
| |
| // Check for terminimation and trailing backslash. |
| if (Split.second.empty()) { |
| if (Repl.size() != Split.first.size() && |
| Error && Error->empty()) |
| *Error = "replacement string contained trailing backslash"; |
| break; |
| } |
| |
| // Otherwise update the replacement string and interpret escapes. |
| Repl = Split.second; |
| |
| // FIXME: We should have a StringExtras function for mapping C99 escapes. |
| switch (Repl[0]) { |
| // Treat all unrecognized characters as self-quoting. |
| default: |
| Res += Repl[0]; |
| Repl = Repl.substr(1); |
| break; |
| |
| // Single character escapes. |
| case 't': |
| Res += '\t'; |
| Repl = Repl.substr(1); |
| break; |
| case 'n': |
| Res += '\n'; |
| Repl = Repl.substr(1); |
| break; |
| |
| // Decimal escapes are backreferences. |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': { |
| // Extract the backreference number. |
| StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); |
| Repl = Repl.substr(Ref.size()); |
| |
| unsigned RefValue; |
| if (!Ref.getAsInteger(10, RefValue) && |
| RefValue < Matches.size()) |
| Res += Matches[RefValue]; |
| else if (Error && Error->empty()) |
| *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); |
| break; |
| } |
| } |
| } |
| |
| // And finally the suffix. |
| Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); |
| |
| return Res; |
| } |
| |
| // These are the special characters matched in functions like "p_ere_exp". |
| static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; |
| |
| bool Regex::isLiteralERE(StringRef Str) { |
| // Check for regex metacharacters. This list was derived from our regex |
| // implementation in regcomp.c and double checked against the POSIX extended |
| // regular expression specification. |
| return Str.find_first_of(RegexMetachars) == StringRef::npos; |
| } |
| |
| std::string Regex::escape(StringRef String) { |
| std::string RegexStr; |
| for (unsigned i = 0, e = String.size(); i != e; ++i) { |
| if (strchr(RegexMetachars, String[i])) |
| RegexStr += '\\'; |
| RegexStr += String[i]; |
| } |
| |
| return RegexStr; |
| } |