Regres: Further optimizations for coverage

Compress common spans into groups and index these.
Reduces the full coverage json from ~500MB to just under 100MB.

Use zlib compression on the final file instead of a zip. The compression algorithm is the same, and produces roughly the same sized output, but a raw zlib file is much quicker to decompress with pako instead of jszip.

Bug: b/152192800
Change-Id: Id444ddf4027b6ace03570719f159ae14e481cd37
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/43249
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
diff --git a/tests/regres/cmd/regres/main.go b/tests/regres/cmd/regres/main.go
index a971dae..219addd 100644
--- a/tests/regres/cmd/regres/main.go
+++ b/tests/regres/cmd/regres/main.go
@@ -26,14 +26,12 @@
 package main
 
 import (
-	"archive/zip"
 	"crypto/sha1"
 	"encoding/hex"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
-	"io"
 	"io/ioutil"
 	"log"
 	"math"
@@ -759,33 +757,23 @@
 	dir := filepath.Join(r.cacheRoot, "coverage")
 	defer os.RemoveAll(dir)
 	if err := git.CheckoutRemoteBranch(dir, url, coverageBranch); err != nil {
-		return fmt.Errorf("Failed to checkout gh-pages branch: %v", err)
+		return cause.Wrap(err, "Failed to checkout gh-pages branch")
 	}
 
-	filePath := filepath.Join(dir, "coverage.zip")
+	filePath := filepath.Join(dir, "coverage.dat")
 	file, err := os.Create(filePath)
 	if err != nil {
-		return fmt.Errorf("Failed to create file '%s': %v", filePath, err)
+		return cause.Wrap(err, "Failed to create file '%s'", filePath)
 	}
 	defer file.Close()
 
-	coverage := cov.JSON(revision.String())
-
-	zw := zip.NewWriter(file)
-	zfw, err := zw.Create("coverage.json")
-	if err != nil {
-		return fmt.Errorf("Failed to create 'coverage.json' file in zip: %v", err)
-	}
-	if _, err := io.Copy(zfw, strings.NewReader(coverage)); err != nil {
-		return fmt.Errorf("Failed to compress coverage datas: %v", err)
-	}
-	if err := zw.Close(); err != nil {
-		return fmt.Errorf("Failed to close zip file: %v", err)
+	if err := cov.Encode(revision.String(), file); err != nil {
+		return cause.Wrap(err, "Failed to encode coverage")
 	}
 	file.Close()
 
 	if err := git.Add(dir, filePath); err != nil {
-		return fmt.Errorf("Failed to git add '%s': %v", filePath, err)
+		return cause.Wrap(err, "Failed to git add '%s'", filePath)
 	}
 
 	shortHash := revision.String()[:8]
@@ -795,18 +783,17 @@
 		Email: r.gerritEmail,
 	})
 	if err != nil {
-		return fmt.Errorf("Failed to 'git commit': %v", err)
+		return cause.Wrap(err, "Failed to git commit")
 	}
 
 	if !r.dryRun {
 		err = git.Push(dir, url, coverageBranch, coverageBranch, git.PushFlags{})
 		if err != nil {
-			return fmt.Errorf("Failed to 'git push': %v", err)
+			return cause.Wrap(err, "Failed to 'git push'")
 		}
+		log.Printf("Coverage for %v pushed to Github\n", shortHash)
 	}
 
-	log.Printf("Coverage for %v pushed to Github\n", shortHash)
-
 	return nil
 }
 
diff --git a/tests/regres/cmd/run_testlist/main.go b/tests/regres/cmd/run_testlist/main.go
index 2414447..c378cac 100644
--- a/tests/regres/cmd/run_testlist/main.go
+++ b/tests/regres/cmd/run_testlist/main.go
@@ -36,6 +36,7 @@
 	"strings"
 	"time"
 
+	"../../cause"
 	"../../cov"
 	"../../deqp"
 	"../../llvm"
@@ -132,8 +133,12 @@
 	}
 
 	if *genCoverage {
-		if err := ioutil.WriteFile("coverage.json", []byte(res.Coverage.JSON("master")), 0666); err != nil {
-			return err
+		f, err := os.Create("coverage.dat")
+		if err != nil {
+			return cause.Wrap(err, "Couldn't open coverage.dat file")
+		}
+		if err := res.Coverage.Encode("master", f); err != nil {
+			return cause.Wrap(err, "Couldn't encode coverage data")
 		}
 	}
 
diff --git a/tests/regres/cov/coverage.go b/tests/regres/cov/coverage.go
index 4ea3474..f34ac67 100644
--- a/tests/regres/cov/coverage.go
+++ b/tests/regres/cov/coverage.go
@@ -19,10 +19,12 @@
 import (
 	"bufio"
 	"bytes"
+	"compress/zlib"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
+	"log"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -30,6 +32,7 @@
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 
 	"../cause"
 	"../llvm"
@@ -252,6 +255,18 @@
 // Path is a tree node path formed from a list of strings
 type Path []string
 
+type treeFile struct {
+	tcm        TestCoverageMap
+	spangroups map[SpanGroupID]SpanGroup
+}
+
+func newTreeFile() *treeFile {
+	return &treeFile{
+		tcm:        TestCoverageMap{},
+		spangroups: map[SpanGroupID]SpanGroup{},
+	}
+}
+
 // Tree represents source code coverage across a tree of different processes.
 // Each tree node is addressed by a Path.
 type Tree struct {
@@ -259,7 +274,7 @@
 	strings     Strings
 	spans       map[Span]SpanID
 	testRoot    Test
-	files       map[string]TestCoverageMap
+	files       map[string]*treeFile
 }
 
 func (t *Tree) init() {
@@ -267,7 +282,7 @@
 		t.strings.m = map[string]StringID{}
 		t.spans = map[Span]SpanID{}
 		t.testRoot = newTest()
-		t.files = map[string]TestCoverageMap{}
+		t.files = map[string]*treeFile{}
 		t.initialized = true
 	}
 }
@@ -290,9 +305,9 @@
 	return out
 }
 
-// File returns the TestCoverageMap for the given file
-func (t *Tree) File(path string) TestCoverageMap {
-	return t.files[path]
+// FileCoverage returns the TestCoverageMap for the given file
+func (t *Tree) FileCoverage(path string) TestCoverageMap {
+	return t.files[path].tcm
 }
 
 // Tests returns the root test
@@ -334,17 +349,17 @@
 	// For each file with coverage...
 	for _, file := range cov.Files {
 		// Lookup or create the file's test coverage map
-		tcm, ok := t.files[file.Path]
+		tf, ok := t.files[file.Path]
 		if !ok {
-			tcm = TestCoverageMap{}
-			t.files[file.Path] = tcm
+			tf = newTreeFile()
+			t.files[file.Path] = tf
 		}
 
 		// Add all the spans to the map, get the span ids
 		spans := t.addSpans(file.Spans)
 
 		// Starting from the test root, walk down the test tree.
-		test := t.testRoot
+		tcm, test := tf.tcm, t.testRoot
 		parent := (*TestCoverage)(nil)
 		for _, indexedTest := range tests {
 			if indexedTest.created {
@@ -471,6 +486,7 @@
 // child test.
 type TestCoverage struct {
 	Spans    SpanSet
+	Group    *SpanGroupID
 	Children TestCoverageMap
 }
 
@@ -488,6 +504,13 @@
 // TestCoverageMap is a map of TestIndex to *TestCoverage.
 type TestCoverageMap map[TestIndex]*TestCoverage
 
+func (tcm TestCoverageMap) traverse(cb func(*TestCoverage)) {
+	for _, tc := range tcm {
+		cb(tc)
+		tc.Children.traverse(cb)
+	}
+}
+
 func (tcm TestCoverageMap) String(t *Test, s Strings) string {
 	sb := strings.Builder{}
 	for _, n := range t.byName(s) {
@@ -547,6 +570,15 @@
 	return sb.String()
 }
 
+func (s SpanSet) containsAll(rhs SpanSet) bool {
+	for span := range rhs {
+		if _, found := s[span]; !found {
+			return false
+		}
+	}
+	return true
+}
+
 func (s SpanSet) sub(rhs SpanSet) SpanSet {
 	out := make(SpanSet, len(s))
 	for span := range s {
@@ -568,10 +600,130 @@
 	return out
 }
 
+// SpanGroupID is an identifier of a SpanGroup.
+type SpanGroupID int
+
+// SpanGroup holds a number of spans, potentially extending from another
+// SpanGroup.
+type SpanGroup struct {
+	spans  SpanSet
+	extend *SpanGroupID
+}
+
+func newSpanGroup() SpanGroup {
+	return SpanGroup{spans: SpanSet{}}
+}
+
 func indent(s string) string {
 	return strings.TrimSuffix(strings.ReplaceAll(s, "\n", "\n  "), "  ")
 }
 
+// Optimize optimizes the Tree by de-duplicating common spans into a tree of
+// SpanGroups.
+func (t *Tree) Optimize() {
+	log.Printf("Optimizing coverage tree...")
+
+	// Start by gathering all of the unique spansets
+	wg := sync.WaitGroup{}
+	wg.Add(len(t.files))
+	for _, file := range t.files {
+		file := file
+		go func() {
+			defer wg.Done()
+			file.optimize()
+		}()
+	}
+	wg.Wait()
+}
+
+func (f *treeFile) optimize() {
+	const minSpansInGroup = 2
+
+	type spansetKey string
+	spansetMap := map[spansetKey]SpanSet{}
+
+	f.tcm.traverse(func(tc *TestCoverage) {
+		if len(tc.Spans) >= minSpansInGroup {
+			key := spansetKey(tc.Spans.String())
+			if _, ok := spansetMap[key]; !ok {
+				spansetMap[key] = tc.Spans
+			}
+		}
+	})
+
+	if len(spansetMap) == 0 {
+		return
+	}
+
+	// Sort by number of spans in each sets.
+	type spansetInfo struct {
+		key spansetKey
+		set SpanSet // fully expanded set
+		grp SpanGroup
+		id  SpanGroupID
+	}
+	spansets := make([]*spansetInfo, 0, len(spansetMap))
+	for key, set := range spansetMap {
+		spansets = append(spansets, &spansetInfo{
+			key: key,
+			set: set,
+			grp: SpanGroup{spans: set},
+			id:  SpanGroupID(len(spansets)),
+		})
+	}
+	sort.Slice(spansets, func(i, j int) bool { return len(spansets[i].set) > len(spansets[j].set) })
+
+	// Loop over the spanGroups starting from the largest, and try to fold them
+	// into the larger sets.
+	// This is O(n^2) complexity.
+nextSpan:
+	for i, a := range spansets[:len(spansets)-1] {
+		for _, b := range spansets[i+1:] {
+			if len(a.set) > len(b.set) && a.set.containsAll(b.set) {
+				extend := b.id // Do not take address of iterator!
+				a.grp.spans = a.set.sub(b.set)
+				a.grp.extend = &extend
+				continue nextSpan
+			}
+		}
+	}
+
+	// Rebuild a map of spansetKey to SpanGroup
+	spangroupMap := make(map[spansetKey]*spansetInfo, len(spansets))
+	for _, s := range spansets {
+		spangroupMap[s.key] = s
+	}
+
+	// Store the groups in the tree
+	f.spangroups = make(map[SpanGroupID]SpanGroup, len(spansets))
+	for _, s := range spansets {
+		f.spangroups[s.id] = s.grp
+	}
+
+	// Update all the uses.
+	f.tcm.traverse(func(tc *TestCoverage) {
+		key := spansetKey(tc.Spans.String())
+		if g, ok := spangroupMap[key]; ok {
+			tc.Spans = nil
+			tc.Group = &g.id
+		}
+	})
+}
+
+// Encode zlib encodes the JSON coverage tree to w.
+func (t *Tree) Encode(revision string, w io.Writer) error {
+	t.Optimize()
+
+	zw := zlib.NewWriter(w)
+
+	_, err := zw.Write([]byte(t.JSON(revision)))
+	if err != nil {
+		return err
+	}
+
+	return zw.Close()
+}
+
 // JSON returns the full test tree serialized to JSON.
 func (t *Tree) JSON(revision string) string {
 	sb := &strings.Builder{}
@@ -644,10 +796,9 @@
 		if i > 0 {
 			sb.WriteString(`,`)
 		}
-		span := s.span
 		sb.WriteString(fmt.Sprintf("[%v,%v,%v,%v]",
-			span.Start.Line, span.Start.Column,
-			span.End.Line, span.End.Column))
+			s.span.Start.Line, s.span.Start.Column,
+			s.span.End.Line, s.span.End.Column))
 	}
 
 	sb.WriteString(`]`)
@@ -662,18 +813,62 @@
 
 	sb.WriteString(`{`)
 	for i, path := range paths {
+		file := t.files[path]
 		if i > 0 {
 			sb.WriteString(`,`)
 		}
 		sb.WriteString(`"`)
 		sb.WriteString(path)
 		sb.WriteString(`":`)
-		t.writeCoverageMapJSON(t.files[path], sb)
+		sb.WriteString(`{`)
+		sb.WriteString(`"g":`)
+		t.writeSpanGroupsJSON(file.spangroups, sb)
+		sb.WriteString(`,"c":`)
+		t.writeCoverageMapJSON(file.tcm, sb)
+		sb.WriteString(`}`)
 	}
 
 	sb.WriteString(`}`)
 }
 
+func (t *Tree) writeSpanGroupsJSON(spangroups map[SpanGroupID]SpanGroup, sb *strings.Builder) {
+	type groupAndID struct {
+		group SpanGroup
+		id    SpanGroupID
+	}
+	groups := make([]groupAndID, 0, len(spangroups))
+	for id, group := range spangroups {
+		groups = append(groups, groupAndID{group, id})
+	}
+	sort.Slice(groups, func(i, j int) bool { return groups[i].id < groups[j].id })
+
+	sb.WriteString(`[`)
+	for i, g := range groups {
+		if i > 0 {
+			sb.WriteString(`,`)
+		}
+		t.writeSpanGroupJSON(g.group, sb)
+	}
+	sb.WriteString(`]`)
+}
+
+func (t *Tree) writeSpanGroupJSON(group SpanGroup, sb *strings.Builder) {
+	sb.WriteString(`{`)
+	sb.WriteString(`"s":[`)
+	for i, spanID := range group.spans.List() {
+		if i > 0 {
+			sb.WriteString(`,`)
+		}
+		sb.WriteString(fmt.Sprintf("%v", spanID))
+	}
+	sb.WriteString(`]`)
+	if group.extend != nil {
+		sb.WriteString(`,"e":`)
+		sb.WriteString(fmt.Sprintf("%v", *group.extend))
+	}
+	sb.WriteString(`}`)
+}
+
 func (t *Tree) writeCoverageMapJSON(c TestCoverageMap, sb *strings.Builder) {
 	ids := make([]TestIndex, 0, len(c))
 	for id := range c {
@@ -710,6 +905,11 @@
 		sb.WriteString(`]`)
 		comma = true
 	}
+	if c.Group != nil {
+		sb.WriteString(`"g":`)
+		sb.WriteString(fmt.Sprintf("%v", *c.Group))
+		comma = true
+	}
 	if len(c.Children) > 0 {
 		if comma {
 			sb.WriteString(`,`)
@@ -746,6 +946,8 @@
 			p.parseTests(&p.tree.testRoot)
 		case "s":
 			p.parseSpans()
+		case "g":
+			p.parseSpanGroups()
 		case "f":
 			p.parseFiles()
 		default:
@@ -800,12 +1002,50 @@
 
 func (p *parser) parseFiles() {
 	p.dict(func(path string) {
-		file := TestCoverageMap{}
-		p.tree.files[path] = file
-		p.parseCoverageMap(file)
+		p.tree.files[path] = p.parseFile()
 	})
 }
 
+func (p *parser) parseFile() *treeFile {
+	file := newTreeFile()
+	if p.peek() == '{' {
+		p.dict(func(key string) {
+			switch key {
+			case "g":
+				file.spangroups = p.parseSpanGroups()
+			case "c":
+				p.parseCoverageMap(file.tcm)
+			default:
+				p.fail("Unknown file key: '%s'", key)
+			}
+		})
+	} else { // backwards compatibility
+		p.parseCoverageMap(file.tcm)
+	}
+	return file
+}
+
+func (p *parser) parseSpanGroups() map[SpanGroupID]SpanGroup {
+	spangroups := map[SpanGroupID]SpanGroup{}
+	p.array(func(groupIdx int) {
+		g := newSpanGroup()
+		p.dict(func(key string) {
+			switch key {
+			case "s":
+				p.array(func(spanIdx int) {
+					id := SpanID(p.integer())
+					g.spans[id] = struct{}{}
+				})
+			case "e":
+				extend := SpanGroupID(p.integer())
+				g.extend = &extend
+			}
+		})
+		spangroups[SpanGroupID(groupIdx)] = g
+	})
+	return spangroups
+}
+
 func (p *parser) parseCoverageMap(tcm TestCoverageMap) {
 	p.array(func(int) {
 		p.expect("[")
@@ -824,6 +1064,9 @@
 				id := SpanID(p.integer())
 				tc.Spans[id] = struct{}{}
 			})
+		case "g":
+			groupID := SpanGroupID(p.integer())
+			tc.Group = &groupID
 		case "c":
 			p.parseCoverageMap(tc.Children)
 		default:
diff --git a/tests/regres/cov/coverage_test.go b/tests/regres/cov/coverage_test.go
index 9e4d88b..aac05b5 100644
--- a/tests/regres/cov/coverage_test.go
+++ b/tests/regres/cov/coverage_test.go
@@ -225,7 +225,7 @@
 }
 
 func checkCoverage(t *testing.T, tree *cov.Tree, file string, expect string) {
-	g, e := tree.File(file).String(tree.Tests(), tree.Strings()), expect
+	g, e := tree.FileCoverage(file).String(tree.Tests(), tree.Strings()), expect
 	if tg, te := trimWS(g), trimWS(e); tg != te {
 		t.Errorf("Coverage not as expected.\nGot:\n%v\nExpect:\n%v\n------\nGot:    %v\nExpect: %v", g, e, tg, te)
 	}