From 0f685d8993a5e70bbffc31aaf06b1a93e7ffb583 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith
Date: Mon, 20 Nov 2023 14:19:14 +0200
Subject: [PATCH] archive: e2e test for ranking against sourcegraph repo (#695)

This is an initial framework for having golden file results for search
results against a real repository. At first we have only added one query
and one repository, but it should be straightforward to grow this list
further.

The golden files we write to disk are a summary of results. This matches
how we have been using the zoekt CLI tool on the keyword branch during
our ranking work.

Test Plan: go test
---
 cmd/zoekt-archive-index/e2e_rank_test.go      | 280 ++++++++++++++++++
 .../testdata/graphql_type_User.txt            |  40 +++
 2 files changed, 320 insertions(+)
 create mode 100644 cmd/zoekt-archive-index/e2e_rank_test.go
 create mode 100644 cmd/zoekt-archive-index/testdata/graphql_type_User.txt

diff --git a/cmd/zoekt-archive-index/e2e_rank_test.go b/cmd/zoekt-archive-index/e2e_rank_test.go
new file mode 100644
index 000000000..12a5f4fd5
--- /dev/null
+++ b/cmd/zoekt-archive-index/e2e_rank_test.go
@@ -0,0 +1,280 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"flag"
+	"fmt"
+	"io"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/sourcegraph/zoekt"
+	"github.com/sourcegraph/zoekt/build"
+	"github.com/sourcegraph/zoekt/query"
+	"github.com/sourcegraph/zoekt/shards"
+)
+
+// update makes TestRanking rewrite each golden file with the current
+// results before comparing against it.
+var update = flag.Bool("update", false, "update golden file")
+
+// debugScore can be set to include much more output. Do not commit the
+// updated golden files, this is purely used for debugging in a local
+// environment.
+var debugScore = flag.Bool("debug_score", false, "include debug output in golden files.")
+
+// TestRanking indexes every archive in archiveURLs into a temporary index
+// directory, runs each query in queries against it, and compares a summary
+// of the results with the golden file testdata/<normalised query>.txt.
+//
+// The test is skipped when -short is set. Pass -update to rewrite the golden
+// files.
+func TestRanking(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping due to short flag")
+	}
+
+	requireCTags(t)
+
+	archiveURLs := []string{
+		"https://github.com/sourcegraph/sourcegraph/tree/v5.2.2",
+	}
+	queries := []string{
+		"graphql type User",
+	}
+
+	indexDir := t.TempDir()
+
+	for _, u := range archiveURLs {
+		if err := indexURL(indexDir, u); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	ss, err := shards.NewDirectorySearcher(indexDir)
+	if err != nil {
+		t.Fatalf("NewDirectorySearcher(%s): %v", indexDir, err)
+	}
+	defer ss.Close()
+
+	for _, queryStr := range queries {
+		// normalise queryStr for writing to fs
+		name := strings.Map(func(r rune) rune {
+			if strings.ContainsRune(" :", r) {
+				return '_'
+			}
+			if '0' <= r && r <= '9' ||
+				'a' <= r && r <= 'z' ||
+				'A' <= r && r <= 'Z' {
+				return r
+			}
+			return -1
+		}, queryStr)
+
+		t.Run(name, func(t *testing.T) {
+			q, err := query.Parse(queryStr)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			sOpts := zoekt.SearchOptions{
+				// Use the same options sourcegraph has by default
+				ChunkMatches:       true,
+				MaxWallTime:        20 * time.Second,
+				ShardMaxMatchCount: 10_000 * 10,
+				TotalMaxMatchCount: 100_000 * 10,
+				MaxDocDisplayCount: 500,
+
+				DebugScore: *debugScore,
+			}
+			result, err := ss.Search(context.Background(), q, &sOpts)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			var gotBuf bytes.Buffer
+			marshalMatches(&gotBuf, queryStr, q, result.Files)
+			got := gotBuf.Bytes()
+
+			wantPath := filepath.Join("testdata", name+".txt")
+			if *update {
+				if err := os.WriteFile(wantPath, got, 0600); err != nil {
+					t.Fatal(err)
+				}
+			}
+			want, err := os.ReadFile(wantPath)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if d := cmp.Diff(string(want), string(got)); d != "" {
+				t.Fatalf("unexpected (-want, +got):\n%s", d)
+			}
+		})
+	}
+}
+
+// tarballCache is the directory downloaded archives are kept in so that
+// later runs can reuse them.
+var tarballCache = "/tmp/zoekt-test-ranking-tarballs-" + os.Getenv("USER")
+
+// indexURL indexes the archive identified by u into indexDir. The tarball is
+// downloaded into tarballCache unless a copy is already there.
+func indexURL(indexDir, u string) error {
+	if err := os.MkdirAll(tarballCache, 0700); err != nil {
+		return err
+	}
+
+	opts := Options{
+		Archive: u,
+	}
+	opts.SetDefaults() // sets metadata like Name and the codeload URL
+	u = opts.Archive
+
+	// update Archive location to cached location
+	cacheBase := fmt.Sprintf("%s-%s%s.tar.gz", url.QueryEscape(opts.Name), opts.Branch, opts.Commit) // assume .tar.gz
+	path := filepath.Join(tarballCache, cacheBase)
+	opts.Archive = path
+
+	// Only a missing file means "not cached yet"; any other Stat failure is
+	// returned instead of being silently ignored.
+	if _, err := os.Stat(path); os.IsNotExist(err) {
+		if err := download(u, path); err != nil {
+			return err
+		}
+	} else if err != nil {
+		return err
+	}
+
+	// TODO scip
+	// languageMap := make(ctags.LanguageMap)
+	// for _, lang := range []string{"kotlin", "rust", "ruby", "go", "python", "javascript", "c_sharp", "scala", "typescript", "zig"} {
+	//   languageMap[lang] = ctags.ScipCTags
+	// }
+
+	err := do(opts, build.Options{
+		IndexDir:         indexDir,
+		CTagsMustSucceed: true,
+	})
+	if err != nil {
+		return fmt.Errorf("failed to index %s: %w", opts.Archive, err)
+	}
+
+	return nil
+}
+
+// download fetches url and stores it at dst. The data is written to
+// dst+".part" first and renamed into place once complete, so dst is never
+// observed half-written. On failure the partial file is removed.
+func download(url, dst string) (err error) {
+	tmpPath := dst + ".part"
+
+	rc, err := openReader(url)
+	if err != nil {
+		return err
+	}
+	defer rc.Close()
+
+	f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if err != nil {
+			// Best-effort cleanup; the error being returned is the one that
+			// matters. Close is harmless if f was already closed.
+			_ = f.Close()
+			_ = os.Remove(tmpPath)
+		}
+	}()
+
+	if _, err = io.Copy(f, rc); err != nil {
+		return err
+	}
+	if err = f.Close(); err != nil {
+		return err
+	}
+
+	return os.Rename(tmpPath, dst)
+}
+
+// Limits on how much of a result set marshalMatches writes out; anything
+// beyond them is summarised in a "hidden N more ..." line.
+const (
+	chunkMatchesPerFile  = 3
+	fileMatchesPerSearch = 6
+)
+
+// marshalMatches writes a summary of a search to w: the query string, the
+// parsed query, and for each of the first fileMatchesPerSearch files its
+// name followed by up to chunkMatchesPerFile chunk matches. Write errors are
+// discarded.
+func marshalMatches(w io.Writer, queryStr string, q query.Q, files []zoekt.FileMatch) {
+	_, _ = fmt.Fprintf(w, "queryString: %s\n", queryStr)
+	_, _ = fmt.Fprintf(w, "query: %s\n\n", q)
+
+	files, hiddenFiles := splitAtIndex(files, fileMatchesPerSearch)
+	for _, f := range files {
+		_, _ = fmt.Fprintf(w, "%s/%s%s\n", f.Repository, f.FileName, addTabIfNonEmpty(f.Debug))
+
+		chunks, hidden := splitAtIndex(f.ChunkMatches, chunkMatchesPerFile)
+
+		for _, m := range chunks {
+			_, _ = fmt.Fprintf(w, "%d:%s%s\n", m.ContentStart.LineNumber, string(m.Content), addTabIfNonEmpty(m.DebugScore))
+		}
+
+		if len(hidden) > 0 {
+			_, _ = fmt.Fprintf(w, "hidden %d more line matches\n", len(hidden))
+		}
+		_, _ = fmt.Fprintln(w)
+	}
+
+	if len(hiddenFiles) > 0 {
+		_, _ = fmt.Fprintf(w, "hidden %d more file matches\n", len(hiddenFiles))
+	}
+}
+
+// splitAtIndex returns s[:idx] and s[idx:]. If idx is not less than len(s)
+// it returns s and nil. idx must not be negative.
+func splitAtIndex[E any](s []E, idx int) ([]E, []E) {
+	if idx < len(s) {
+		return s[:idx], s[idx:]
+	}
+	return s, nil
+}
+
+// addTabIfNonEmpty returns s prefixed with a tab, or s itself if it is empty.
+func addTabIfNonEmpty(s string) string {
+	if s != "" {
+		return "\t" + s
+	}
+	return s
+}
+
+// requireCTags lets the test proceed if CTAGS_COMMAND is set or
+// universal-ctags is on PATH. Otherwise it fails the test when CI is set and
+// skips it when not.
+func requireCTags(tb testing.TB) {
+	tb.Helper()
+
+	if os.Getenv("CTAGS_COMMAND") != "" {
+		return
+	}
+	if _, err := exec.LookPath("universal-ctags"); err == nil {
+		return
+	}
+
+	// On CI we require ctags to be available. Otherwise we skip
+	if os.Getenv("CI") != "" {
+		tb.Fatal("universal-ctags is missing")
+	} else {
+		tb.Skip("universal-ctags is missing")
+	}
+}
diff --git a/cmd/zoekt-archive-index/testdata/graphql_type_User.txt b/cmd/zoekt-archive-index/testdata/graphql_type_User.txt
new file mode 100644
index 000000000..5c1e21484
--- /dev/null
+++ b/cmd/zoekt-archive-index/testdata/graphql_type_User.txt
@@ -0,0 +1,40 @@
+queryString: graphql type User
+query: (and substr:"graphql" substr:"type" case_substr:"User")
+
+github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/schema.graphql
+6376:type User implements Node & SettingsSubject & Namespace {
+3862: type: GitRefType
+5037: type: GitRefType!
+hidden 460 more line matches
+
+github.com/sourcegraph/sourcegraph/internal/types/types.go
+850:type User struct {
+1372: Type *SearchCountStatistics
+1766: Type string
+hidden 234 more line matches
+
+github.com/sourcegraph/sourcegraph/client/web/src/enterprise/insights/core/backend/gql-backend/methods/get-dashboard-owners.ts
+22: type: InsightsDashboardOwnerType.Global,
+32: type: InsightsDashboardOwnerType.Personal,
+18: const { currentUser, site } = data
+hidden 8 more line matches
+
+github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/apitest/types.go
+47:type User struct {
+9: Typename string `json:"__typename"`
+32: Typename string `json:"__typename"`
+hidden 11 more line matches
+
+github.com/sourcegraph/sourcegraph/cmd/frontend/internal/batches/resolvers/apitest/types.go
+52:type User struct {
+364: User *User
+393: Type string
+hidden 68 more line matches
+
+github.com/sourcegraph/sourcegraph/internal/extsvc/github/common.go
+2030:type User struct {
+66: User *Actor `json:"User,omitempty"`
+527: Type string
+hidden 136 more line matches
+
+hidden 494 more file matches