| #!/usr/bin/env python |
| |
| import argparse |
| import email.mime.multipart |
| import email.mime.text |
| import logging |
| import os.path |
| import pickle |
| import re |
| import smtplib |
| import subprocess |
| import sys |
| from datetime import datetime, timedelta |
| from phabricator import Phabricator |
| |
| # Setting up a virtualenv to run this script can be done by running the |
| # following commands: |
| # $ virtualenv venv |
| # $ . ./venv/bin/activate |
| # $ pip install Phabricator |
| |
| GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), ) |
| |
# The PhabXXX classes below represent objects as modelled by Phabricator.
# The classes can be serialized to disk so that we do not needlessly have to
# re-fetch lots of data from Phabricator, which would make this script
# unusably slow.
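#
# For example, with the defaults below, the reviews cache ends up in
# PhabObjectCache/PhabReviews.pickle and the users cache in
# PhabObjectCache/PhabUsers.pickle (see PhabObjectCache._get_pickle_name).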
| |
| |
| class PhabObject: |
| OBJECT_KIND = None |
| |
| def __init__(self, id): |
| self.id = id |
| |
| |
| class PhabObjectCache: |
| def __init__(self, PhabObjectClass): |
| self.PhabObjectClass = PhabObjectClass |
| self.most_recent_info = None |
| self.oldest_info = None |
| self.id2PhabObjects = {} |
| |
| def get_name(self): |
| return self.PhabObjectClass.OBJECT_KIND + "sCache" |
| |
| def get(self, id): |
| if id not in self.id2PhabObjects: |
| self.id2PhabObjects[id] = self.PhabObjectClass(id) |
| return self.id2PhabObjects[id] |
| |
| def get_ids_in_cache(self): |
| return self.id2PhabObjects.keys() |
| |
| def get_objects(self): |
| return self.id2PhabObjects.values() |
| |
| DEFAULT_DIRECTORY = "PhabObjectCache" |
| |
| def _get_pickle_name(self, directory): |
| file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle" |
| return os.path.join(directory, file_name) |
| |
| def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY): |
| """ |
| FIXME: consider if serializing to JSON would bring interoperability |
| advantages over serializing to pickle. |
| """ |
| try: |
| f = open(self._get_pickle_name(directory), "rb") |
| except IOError as err: |
| print("Could not find cache. Error message: {0}. Continuing..." |
| .format(err)) |
| else: |
| with f: |
| try: |
| d = pickle.load(f) |
| self.__dict__.update(d) |
                except (EOFError, pickle.UnpicklingError) as err:
| print("Cache seems to be corrupt. " + |
| "Not using cache. Error message: {0}".format(err)) |
| |
| def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY): |
| if not os.path.exists(directory): |
| os.makedirs(directory) |
| with open(self._get_pickle_name(directory), "wb") as f: |
| pickle.dump(self.__dict__, f) |
| print("wrote cache to disk, most_recent_info= {0}".format( |
| datetime.fromtimestamp(self.most_recent_info) |
| if self.most_recent_info is not None else None)) |
| |
| |
| class PhabReview(PhabObject): |
| OBJECT_KIND = "Review" |
| |
| def __init__(self, id): |
| PhabObject.__init__(self, id) |
| |
| def update(self, title, dateCreated, dateModified, author): |
| self.title = title |
| self.dateCreated = dateCreated |
| self.dateModified = dateModified |
| self.author = author |
| |
| def setPhabDiffs(self, phabDiffs): |
| self.phabDiffs = phabDiffs |
| |
| |
| class PhabUser(PhabObject): |
| OBJECT_KIND = "User" |
| |
| def __init__(self, id): |
| PhabObject.__init__(self, id) |
| |
| def update(self, phid, realName): |
| self.phid = phid |
| self.realName = realName |
| |
| |
| class PhabHunk: |
| def __init__(self, rest_api_hunk): |
| self.oldOffset = int(rest_api_hunk["oldOffset"]) |
| self.oldLength = int(rest_api_hunk["oldLength"]) |
| # self.actual_lines_changed_offset will contain the offsets of the |
| # lines that were changed in this hunk. |
| self.actual_lines_changed_offset = [] |
| offset = self.oldOffset |
| inHunk = False |
| hunkStart = -1 |
| contextLines = 3 |
| for line in rest_api_hunk["corpus"].split("\n"): |
| if line.startswith("+"): |
| # line is a new line that got introduced in this patch. |
| # Do not record it as a changed line. |
| if inHunk is False: |
| inHunk = True |
| hunkStart = max(self.oldOffset, offset - contextLines) |
| continue |
| if line.startswith("-"): |
| # line was changed or removed from the older version of the |
| # code. Record it as a changed line. |
| if inHunk is False: |
| inHunk = True |
| hunkStart = max(self.oldOffset, offset - contextLines) |
| offset += 1 |
| continue |
| # line is a context line. |
| if inHunk is True: |
| inHunk = False |
| hunkEnd = offset + contextLines |
| self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) |
| offset += 1 |
| if inHunk is True: |
| hunkEnd = offset + contextLines |
| self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) |
| |
| # The above algorithm could result in adjacent or overlapping ranges |
| # being recorded into self.actual_lines_changed_offset. |
| # Merge the adjacent and overlapping ranges in there: |
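        # For example, hypothetical ranges [(10, 18), (15, 24), (40, 47)]
        # would be merged into [(10, 24), (40, 47)].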
| t = [] |
| lastRange = None |
| for start, end in self.actual_lines_changed_offset + \ |
| [(sys.maxsize, sys.maxsize)]: |
| if lastRange is None: |
| lastRange = (start, end) |
| else: |
| if lastRange[1] >= start: |
| lastRange = (lastRange[0], end) |
| else: |
| t.append(lastRange) |
| lastRange = (start, end) |
| self.actual_lines_changed_offset = t |
| |
| |
| class PhabChange: |
| def __init__(self, rest_api_change): |
| self.oldPath = rest_api_change["oldPath"] |
| self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]] |
| |
| |
| class PhabDiff(PhabObject): |
| OBJECT_KIND = "Diff" |
| |
| def __init__(self, id): |
| PhabObject.__init__(self, id) |
| |
| def update(self, rest_api_results): |
| self.revisionID = rest_api_results["revisionID"] |
| self.dateModified = int(rest_api_results["dateModified"]) |
| self.dateCreated = int(rest_api_results["dateCreated"]) |
| self.changes = [PhabChange(c) for c in rest_api_results["changes"]] |
| |
| |
| class ReviewsCache(PhabObjectCache): |
| def __init__(self): |
| PhabObjectCache.__init__(self, PhabReview) |
| |
| |
| class UsersCache(PhabObjectCache): |
| def __init__(self): |
| PhabObjectCache.__init__(self, PhabUser) |
| |
| |
| reviews_cache = ReviewsCache() |
| users_cache = UsersCache() |
| |
| |
| def init_phab_connection(): |
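    # Note: Phabricator() is assumed to pick up the Phabricator host and API
    # token from the user's ~/.arcrc, the default behaviour of the python
    # 'phabricator' package.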
| phab = Phabricator() |
| phab.update_interfaces() |
| return phab |
| |
| |
| def update_cached_info(phab, cache, phab_query, order, record_results, |
| max_nr_entries_per_fetch, max_nr_days_to_cache): |
| q = phab |
| LIMIT = max_nr_entries_per_fetch |
| for query_step in phab_query: |
| q = getattr(q, query_step) |
| results = q(order=order, limit=LIMIT) |
| most_recent_info, oldest_info = record_results(cache, results, phab) |
| oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \ |
| timedelta(days=max_nr_days_to_cache) |
| most_recent_info_overall = most_recent_info |
| cache.write_cache_to_disk() |
| after = results["cursor"]["after"] |
| print("after: {0!r}".format(after)) |
| print("most_recent_info: {0}".format( |
| datetime.fromtimestamp(most_recent_info))) |
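    # Page backwards through older results (following the "after" cursor)
    # until the cache covers max_nr_days_to_cache days of history, or until
    # we already have both the newest and the oldest data we need.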
| while (after is not None |
| and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch): |
| need_more_older_data = \ |
| (cache.oldest_info is None or |
| datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch) |
| print(("need_more_older_data={0} cache.oldest_info={1} " + |
| "oldest_info_to_fetch={2}").format( |
| need_more_older_data, |
| datetime.fromtimestamp(cache.oldest_info) |
| if cache.oldest_info is not None else None, |
| oldest_info_to_fetch)) |
| need_more_newer_data = \ |
| (cache.most_recent_info is None or |
| cache.most_recent_info < most_recent_info) |
| print(("need_more_newer_data={0} cache.most_recent_info={1} " + |
| "most_recent_info={2}") |
| .format(need_more_newer_data, cache.most_recent_info, |
| most_recent_info)) |
| if not need_more_older_data and not need_more_newer_data: |
| break |
| results = q(order=order, after=after, limit=LIMIT) |
| most_recent_info, oldest_info = record_results(cache, results, phab) |
| after = results["cursor"]["after"] |
| print("after: {0!r}".format(after)) |
| print("most_recent_info: {0}".format( |
| datetime.fromtimestamp(most_recent_info))) |
| cache.write_cache_to_disk() |
| cache.most_recent_info = most_recent_info_overall |
| if after is None: |
        # We fetched all records. Mark the cache as containing all info since
        # the start of time.
| oldest_info = 0 |
| cache.oldest_info = oldest_info |
| cache.write_cache_to_disk() |
| |
| |
| def record_reviews(cache, reviews, phab): |
| most_recent_info = None |
| oldest_info = None |
| for reviewInfo in reviews["data"]: |
| if reviewInfo["type"] != "DREV": |
| continue |
| id = reviewInfo["id"] |
| # phid = reviewInfo["phid"] |
| dateModified = int(reviewInfo["fields"]["dateModified"]) |
| dateCreated = int(reviewInfo["fields"]["dateCreated"]) |
| title = reviewInfo["fields"]["title"] |
| author = reviewInfo["fields"]["authorPHID"] |
| phabReview = cache.get(id) |
| if "dateModified" not in phabReview.__dict__ or \ |
| dateModified > phabReview.dateModified: |
| diff_results = phab.differential.querydiffs(revisionIDs=[id]) |
| diff_ids = sorted(diff_results.keys()) |
| phabDiffs = [] |
| for diff_id in diff_ids: |
| diffInfo = diff_results[diff_id] |
| d = PhabDiff(diff_id) |
| d.update(diffInfo) |
| phabDiffs.append(d) |
| phabReview.update(title, dateCreated, dateModified, author) |
| phabReview.setPhabDiffs(phabDiffs) |
| print("Updated D{0} modified on {1} ({2} diffs)".format( |
| id, datetime.fromtimestamp(dateModified), len(phabDiffs))) |
| |
| if most_recent_info is None: |
| most_recent_info = dateModified |
| elif most_recent_info < dateModified: |
| most_recent_info = dateModified |
| |
| if oldest_info is None: |
| oldest_info = dateModified |
| elif oldest_info > dateModified: |
| oldest_info = dateModified |
| return most_recent_info, oldest_info |
| |
| |
| def record_users(cache, users, phab): |
| most_recent_info = None |
| oldest_info = None |
| for info in users["data"]: |
| if info["type"] != "USER": |
| continue |
| id = info["id"] |
| phid = info["phid"] |
| dateModified = int(info["fields"]["dateModified"]) |
| # dateCreated = int(info["fields"]["dateCreated"]) |
| realName = info["fields"]["realName"] |
| phabUser = cache.get(id) |
| phabUser.update(phid, realName) |
| if most_recent_info is None: |
| most_recent_info = dateModified |
| elif most_recent_info < dateModified: |
| most_recent_info = dateModified |
| if oldest_info is None: |
| oldest_info = dateModified |
| elif oldest_info > dateModified: |
| oldest_info = dateModified |
| return most_recent_info, oldest_info |
| |
| |
| PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"), |
| "updated", record_reviews, 5, 7), |
| (users_cache, ("user", "search"), "newest", record_users, |
| 100, 1000)) |
| |
| |
| def load_cache(): |
| for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO: |
| cache.populate_cache_from_disk() |
| print("Loaded {0} nr entries: {1}".format( |
| cache.get_name(), len(cache.get_ids_in_cache()))) |
| print("Loaded {0} has most recent info: {1}".format( |
| cache.get_name(), |
| datetime.fromtimestamp(cache.most_recent_info) |
| if cache.most_recent_info is not None else None)) |
| |
| |
| def update_cache(phab): |
| load_cache() |
| for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \ |
| max_nr_days_to_cache in PHABCACHESINFO: |
| update_cached_info(phab, cache, phab_query, order, record_results, |
| max_nr_entries_per_fetch, max_nr_days_to_cache) |
| ids_in_cache = cache.get_ids_in_cache() |
| print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name())) |
| cache.write_cache_to_disk() |
| |
| |
| def get_most_recent_reviews(days): |
| newest_reviews = sorted( |
| reviews_cache.get_objects(), key=lambda r: -r.dateModified) |
| if len(newest_reviews) == 0: |
| return newest_reviews |
| most_recent_review_time = \ |
| datetime.fromtimestamp(newest_reviews[0].dateModified) |
| cut_off_date = most_recent_review_time - timedelta(days=days) |
| result = [] |
| for review in newest_reviews: |
| if datetime.fromtimestamp(review.dateModified) < cut_off_date: |
| return result |
| result.append(review) |
| return result |
| |
| |
| # All of the above code is about fetching data from Phabricator and caching it |
| # on local disk. The below code contains the actual "business logic" for this |
| # script. |
| |
| _userphid2realname = None |
| |
| |
| def get_real_name_from_author(user_phid): |
| global _userphid2realname |
| if _userphid2realname is None: |
| _userphid2realname = {} |
| for user in users_cache.get_objects(): |
| _userphid2realname[user.phid] = user.realName |
| return _userphid2realname.get(user_phid, "unknown") |
| |
| |
| def print_most_recent_reviews(phab, days, filter_reviewers): |
| msgs = [] |
| |
| def add_msg(msg): |
| msgs.append(msg) |
| print(msg) |
| |
| newest_reviews = get_most_recent_reviews(days) |
| add_msg(u"These are the reviews that look interesting to be reviewed. " + |
| u"The report below has 2 sections. The first " + |
| u"section is organized per review; the second section is organized " |
| + u"per potential reviewer.\n") |
| oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None |
| oldest_datetime = \ |
| datetime.fromtimestamp(oldest_review.dateModified) \ |
| if oldest_review else None |
| add_msg((u"The report below is based on analyzing the reviews that got " + |
| u"touched in the past {0} days (since {1}). " + |
| u"The script found {2} such reviews.\n").format( |
| days, oldest_datetime, len(newest_reviews))) |
| reviewer2reviews_and_scores = {} |
| for i, review in enumerate(newest_reviews): |
| matched_reviewers = find_reviewers_for_review(review) |
| matched_reviewers = filter_reviewers(matched_reviewers) |
| if len(matched_reviewers) == 0: |
| continue |
| add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" + |
| u" Last updated on {4}").format( |
| i, review.id, |
| get_real_name_from_author(review.author), review.title, |
| datetime.fromtimestamp(review.dateModified))) |
| for reviewer, scores in matched_reviewers: |
| add_msg(u" potential reviewer {0}, score {1}".format( |
| reviewer, |
| "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")")) |
| if reviewer not in reviewer2reviews_and_scores: |
| reviewer2reviews_and_scores[reviewer] = [] |
| reviewer2reviews_and_scores[reviewer].append((review, scores)) |
| |
| # Print out a summary per reviewer. |
| for reviewer in sorted(reviewer2reviews_and_scores.keys()): |
| reviews_and_scores = reviewer2reviews_and_scores[reviewer] |
| reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True) |
| add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format( |
| reviewer, len(reviews_and_scores))) |
| for review, scores in reviews_and_scores: |
| add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format( |
| "/".join(["{0:.1f}%".format(s) for s in scores]), review.id, |
| review.title, get_real_name_from_author(review.author))) |
| return "\n".join(msgs) |
| |
| |
| def get_git_cmd_output(cmd): |
| output = None |
| try: |
| logging.debug(cmd) |
| output = subprocess.check_output( |
| cmd, shell=True, stderr=subprocess.STDOUT) |
| except subprocess.CalledProcessError as e: |
| logging.debug(str(e)) |
| if output is None: |
| return None |
| return output.decode("utf-8", errors='ignore') |
| |
| |
| reAuthorMail = re.compile("^author-mail <([^>]*)>.*$") |
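# reAuthorMail matches the "author-mail" header lines emitted by
# "git blame --line-porcelain", e.g. (hypothetical address):
#   author-mail <someone@example.com>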
| |
| |
| def parse_blame_output_line_porcelain(blame_output): |
| email2nr_occurences = {} |
| if blame_output is None: |
| return email2nr_occurences |
| for line in blame_output.split('\n'): |
| m = reAuthorMail.match(line) |
| if m: |
| author_email_address = m.group(1) |
| if author_email_address not in email2nr_occurences: |
| email2nr_occurences[author_email_address] = 1 |
| else: |
| email2nr_occurences[author_email_address] += 1 |
| return email2nr_occurences |
| |
| |
| def find_reviewers_for_diff_heuristic(diff): |
| # Heuristic 1: assume good reviewers are the ones that touched the same |
| # lines before as this patch is touching. |
| # Heuristic 2: assume good reviewers are the ones that touched the same |
| # files before as this patch is touching. |
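    # Each potential reviewer ends up with two percentages: the share of the
    # blamed lines inside the changed ranges (heuristic 1), and the share of
    # the touched files in which they authored at least one line
    # (heuristic 2); see the match score computation at the end of this
    # function.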
| reviewers2nr_lines_touched = {} |
| reviewers2nr_files_touched = {} |
| # Assume last revision before diff was modified is the revision the diff |
| # applies to. |
| git_repo = "git_repos/llvm" |
| cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format( |
| git_repo, |
| datetime.fromtimestamp( |
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
| base_revision = get_git_cmd_output(cmd).strip() |
| logging.debug("Base revision={0}".format(base_revision)) |
| for change in diff.changes: |
| path = change.oldPath |
| # Compute heuristic 1: look at context of patch lines. |
| for hunk in change.hunks: |
| for start_line, end_line in hunk.actual_lines_changed_offset: |
| # Collect git blame results for authors in those ranges. |
| cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " + |
| "-w --line-porcelain -L {1},{2} {3} -- {4}").format( |
| git_repo, start_line, end_line, base_revision, path) |
| blame_output = get_git_cmd_output(cmd) |
| for reviewer, nr_occurences in \ |
| parse_blame_output_line_porcelain(blame_output).items(): |
| if reviewer not in reviewers2nr_lines_touched: |
| reviewers2nr_lines_touched[reviewer] = 0 |
| reviewers2nr_lines_touched[reviewer] += nr_occurences |
| # Compute heuristic 2: don't look at context, just at files touched. |
        # Collect git blame results for authors over the whole file.
| cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " + |
| "--line-porcelain {1} -- {2}").format(git_repo, base_revision, |
| path) |
| blame_output = get_git_cmd_output(cmd) |
| for reviewer, nr_occurences in parse_blame_output_line_porcelain( |
| blame_output).items(): |
| if reviewer not in reviewers2nr_files_touched: |
| reviewers2nr_files_touched[reviewer] = 0 |
| reviewers2nr_files_touched[reviewer] += 1 |
| |
| # Compute "match scores" |
| total_nr_lines = sum(reviewers2nr_lines_touched.values()) |
| total_nr_files = len(diff.changes) |
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
           if total_nr_lines != 0 else 0,
           nr_files_touched*100.0/total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer, nr_files_touched
         in reviewers2nr_files_touched.items()]
| reviewers_matchscores.sort(key=lambda i: i[1], reverse=True) |
| return reviewers_matchscores |
| |
| |
| def find_reviewers_for_review(review): |
| # Process the newest diff first. |
| diffs = sorted( |
| review.phabDiffs, key=lambda d: d.dateModified, reverse=True) |
| if len(diffs) == 0: |
        # Return an empty list so callers can iterate the result safely.
        return []
| diff = diffs[0] |
| matched_reviewers = find_reviewers_for_diff_heuristic(diff) |
| # Show progress, as this is a slow operation: |
| sys.stdout.write('.') |
| sys.stdout.flush() |
| logging.debug(u"matched_reviewers: {0}".format(matched_reviewers)) |
| return matched_reviewers |
| |
| |
| def update_git_repos(): |
| git_repos_directory = "git_repos" |
| for name, url in GIT_REPO_METADATA: |
| dirname = os.path.join(git_repos_directory, name) |
| if not os.path.exists(dirname): |
| cmd = "git clone {0} {1}".format(url, dirname) |
| output = get_git_cmd_output(cmd) |
| cmd = "git -C {0} pull --rebase".format(dirname) |
| output = get_git_cmd_output(cmd) |
| |
| |
| def send_emails(email_addresses, sender, msg): |
| s = smtplib.SMTP() |
| s.connect() |
| for email_address in email_addresses: |
| email_msg = email.mime.multipart.MIMEMultipart() |
| email_msg['From'] = sender |
| email_msg['To'] = email_address |
| email_msg['Subject'] = 'LLVM patches you may be able to review.' |
        email_msg.attach(email.mime.text.MIMEText(msg, 'plain', 'utf-8'))
| # python 3.x: s.send_message(email_msg) |
| s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string()) |
| s.quit() |
| |
| |
| def filter_reviewers_to_report_for(people_to_look_for): |
    # The filter below is just an example: it only reports potential reviews
    # for the people that will receive the report email.
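    # For example (hypothetical data):
    #   flt = filter_reviewers_to_report_for(["someone@example.com"])
    #   flt([("someone@example.com", (12.5, 50.0)),
    #        ("other@example.com", (1.0, 25.0))])
    #   returns [("someone@example.com", (12.5, 50.0))]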
| return lambda potential_reviewers: [r for r in potential_reviewers |
| if r[0] in people_to_look_for] |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser( |
| description='Match open reviews to potential reviewers.') |
| parser.add_argument( |
| '--no-update-cache', |
| dest='update_cache', |
| action='store_false', |
| default=True, |
| help='Do not update cached Phabricator objects') |
| parser.add_argument( |
| '--email-report', |
| dest='email_report', |
| nargs='*', |
| default="", |
| help="A email addresses to send the report to.") |
| parser.add_argument( |
| '--sender', |
| dest='sender', |
| default="", |
| help="The email address to use in 'From' on messages emailed out.") |
| parser.add_argument( |
| '--email-addresses', |
| dest='email_addresses', |
        nargs='*',
        default=[],
| help="The email addresses (as known by LLVM git) of " + |
| "the people to look for reviews for.") |
    parser.add_argument('--verbose', '-v', action='count', default=0)
| |
| args = parser.parse_args() |
| |
| if args.verbose >= 1: |
| logging.basicConfig(level=logging.DEBUG) |
| |
| people_to_look_for = [e.decode('utf-8') for e in args.email_addresses] |
| logging.debug("Will look for reviews that following contributors could " + |
| "review: {}".format(people_to_look_for)) |
| logging.debug("Will email a report to: {}".format(args.email_report)) |
| |
| phab = init_phab_connection() |
| |
| if args.update_cache: |
| update_cache(phab) |
| |
| load_cache() |
| update_git_repos() |
| msg = print_most_recent_reviews( |
| phab, |
| days=1, |
| filter_reviewers=filter_reviewers_to_report_for(people_to_look_for)) |
| |
| if args.email_report != []: |
| send_emails(args.email_report, args.sender, msg) |
| |
| |
| if __name__ == "__main__": |
| main() |
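
# A hypothetical invocation (the script name and addresses are placeholders):
#   python find_reviews.py --email-addresses someone@example.com \
#       --email-report someone@example.com --sender reports@example.com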