From 8d28d7277b93a1729b9e8a5d935d9007a5c14c77 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Fri, 13 Dec 2024 13:14:13 +0100 Subject: [PATCH 1/2] refactor shard name logic We had 2 copies of this logic in the code and this bit me when I introduced id-based shards indexbuilder.go seems an "ok" place to put. We cannot put it in the shards package because of circular dependencies. Test plan: refactor, so relying on CI --- build/builder.go | 14 +----- .../cleanup_test.go | 3 +- indexbuilder.go | 10 ++++ indexbuilder_test.go | 49 +++++++++++++++++++ merge.go | 18 ++----- read_test.go | 6 +-- 6 files changed, 68 insertions(+), 32 deletions(-) create mode 100644 indexbuilder_test.go diff --git a/build/builder.go b/build/builder.go index 503b2f5c1..9f0c65c97 100644 --- a/build/builder.go +++ b/build/builder.go @@ -21,7 +21,6 @@ import ( "crypto/sha1" "flag" "fmt" - "io" "log" "net/url" "os" @@ -334,12 +333,6 @@ func (o *Options) SetDefaults() { } } -func hashString(s string) string { - h := sha1.New() - _, _ = io.WriteString(h, s) - return fmt.Sprintf("%x", h.Sum(nil)) -} - // ShardName returns the name the given index shard. func (o *Options) shardName(n int) string { return o.shardNameVersion(zoekt.IndexFormatVersion, n) @@ -347,12 +340,7 @@ func (o *Options) shardName(n int) string { func (o *Options) shardNameVersion(version, n int) string { prefix := url.QueryEscape(cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name)) - if len(prefix) > 200 { - prefix = prefix[:200] + hashString(prefix)[:8] - } - shardName := filepath.Join(o.IndexDir, - fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) - return shardName + return zoekt.ShardName(o.IndexDir, prefix, version, n) } type IndexState string diff --git a/cmd/zoekt-sourcegraph-indexserver/cleanup_test.go b/cmd/zoekt-sourcegraph-indexserver/cleanup_test.go index 6514b8ffd..5a1fa7666 100644 --- a/cmd/zoekt-sourcegraph-indexserver/cleanup_test.go +++ b/cmd/zoekt-sourcegraph-indexserver/cleanup_test.go @@ -2,7 +2,6 @@ package main import ( "fmt" - "net/url" "os" "path/filepath" "reflect" @@ -23,7 +22,7 @@ func TestCleanup(t *testing.T) { return shard{ RepoID: fakeID(name), RepoName: name, - Path: fmt.Sprintf("%s_v%d.%05d.zoekt", url.QueryEscape(name), 15, n), + Path: zoekt.ShardName("", name, 15, n), ModTime: mtime, RepoTombstone: false, } diff --git a/indexbuilder.go b/indexbuilder.go index 966a92c5d..027edf9f4 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -583,3 +583,13 @@ func (t *DocChecker) clearTrigrams(maxTrigramCount int) { delete(t.trigrams, key) } } + +// ShardName returns the name of the shard for the given prefix, version, and +// shard number. +func ShardName(indexDir string, prefix string, version, n int) string { + prefix = url.QueryEscape(prefix) + if len(prefix) > 200 { + prefix = prefix[:200] + hashString(prefix)[:8] + } + return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) +} diff --git a/indexbuilder_test.go b/indexbuilder_test.go new file mode 100644 index 000000000..7487a5ad3 --- /dev/null +++ b/indexbuilder_test.go @@ -0,0 +1,49 @@ +package zoekt + +import ( + "strings" + "testing" +) + +func TestShardName(t *testing.T) { + tests := []struct { + name string + indexDir string + prefix string + version int + shardNum int + expected string + }{ + { + name: "short prefix", + indexDir: "index", + prefix: "short", + version: 1, + shardNum: 42, + expected: "index/short_v1.00042.zoekt", + }, + { + name: "long prefix truncated", + indexDir: "index", + prefix: strings.Repeat("a", 300), + version: 2, + shardNum: 1, + expected: "index/" + strings.Repeat("a", 200) + "003ef1ba" + "_v2.00001.zoekt", + }, + { + name: "empty indexDir", + prefix: "short", + version: 1, + expected: "short_v1.00000.zoekt", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual := ShardName(test.indexDir, test.prefix, test.version, test.shardNum) + if actual != test.expected { + t.Errorf("expected %q, got %q", test.expected, actual) + } + }) + } +} diff --git a/merge.go b/merge.go index 8b2f17342..0473663e1 100644 --- a/merge.go +++ b/merge.go @@ -5,7 +5,6 @@ import ( "fmt" "io" "log" - "net/url" "os" "path/filepath" "runtime" @@ -170,10 +169,10 @@ func explode(dstDir string, f IndexFile, ibFuncs ...indexBuilderFunc) (map[strin prefix = ib.repoList[0].Name } - fn := filepath.Join(dstDir, shardName(prefix, ib.indexFormatVersion, 0)) - fnTmp := fn + ".tmp" - shardNames[fnTmp] = fn - return builderWriteAll(fnTmp, ib) + shardName := ShardName(dstDir, prefix, ib.indexFormatVersion, 0) + shardNameTmp := shardName + ".tmp" + shardNames[shardNameTmp] = shardName + return builderWriteAll(shardNameTmp, ib) } var ib *IndexBuilder @@ -264,12 +263,3 @@ func hashString(s string) string { _, _ = io.WriteString(h, s) return fmt.Sprintf("%x", h.Sum(nil)) } - -// copied from builder package to avoid circular imports. -func shardName(prefix string, version, n int) string { - abs := url.QueryEscape(prefix) - if len(abs) > 200 { - abs = abs[:200] + hashString(abs)[:8] - } - return fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n) -} diff --git a/read_test.go b/read_test.go index e87c643af..7b3827b1d 100644 --- a/read_test.go +++ b/read_test.go @@ -370,10 +370,10 @@ func TestBackwardsCompat(t *testing.T) { t.Fatal(err) } - outname := fmt.Sprintf("testdata/backcompat/new_v%d.%05d.zoekt", IndexFormatVersion, 0) - t.Log("writing new file", outname) + outName := ShardName("testdata/backcompat", "new", IndexFormatVersion, 0) + t.Log("writing new file", outName) - err = os.WriteFile(outname, buf.Bytes(), 0o644) + err = os.WriteFile(outName, buf.Bytes(), 0o644) if err != nil { t.Fatalf("Creating output file: %v", err) } From 2aa59dae832aa46b2bac2e3b09c029b99990b9a9 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Wed, 18 Dec 2024 12:59:41 +0100 Subject: [PATCH 2/2] don't call url.QueryEscape twice --- build/builder.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/build/builder.go b/build/builder.go index 9f0c65c97..498318699 100644 --- a/build/builder.go +++ b/build/builder.go @@ -339,8 +339,7 @@ func (o *Options) shardName(n int) string { } func (o *Options) shardNameVersion(version, n int) string { - prefix := url.QueryEscape(cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name)) - return zoekt.ShardName(o.IndexDir, prefix, version, n) + return zoekt.ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n) } type IndexState string