Skip to content

Commit

Permalink
compute precise language information with go-enry for lang: queries (#…
Browse files Browse the repository at this point in the history
…220)

use go-enry to compute more precise language information than ctags

make lang: use filename fallback for older index versions
  • Loading branch information
Ryan Hitchman authored Dec 8, 2021
1 parent f40c58e commit d86fb30
Show file tree
Hide file tree
Showing 19 changed files with 152 additions and 32 deletions.
2 changes: 1 addition & 1 deletion api.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ type IndexMetadata struct {
IndexMinReaderVersion int
IndexTime time.Time
PlainASCII bool
LanguageMap map[string]byte
LanguageMap map[string]uint16
ZoektVersion string
ID string
}
Expand Down
5 changes: 0 additions & 5 deletions build/ctags.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"time"

"github.com/google/zoekt"
Expand Down Expand Up @@ -152,7 +151,6 @@ func ctagsAddSymbolsParser(todo []*zoekt.Document, parser ctags.Parser) error {
if len(es) == 0 {
continue
}
doc.Language = strings.ToLower(es[0].Language)

symOffsets, symMetaData, err := tagsToSections(doc.Content, es)
if err != nil {
Expand Down Expand Up @@ -205,9 +203,6 @@ func ctagsAddSymbols(todo []*zoekt.Document, parser ctags.Parser, bin string) er
}
todo[pathIndices[k]].Symbols = symOffsets
todo[pathIndices[k]].SymbolsMetaData = symMetaData
if len(tags) > 0 {
todo[pathIndices[k]].Language = strings.ToLower(tags[0].Language)
}
}
return nil
}
Expand Down
34 changes: 33 additions & 1 deletion eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ import (
"context"
"fmt"
"log"
"regexp"
"regexp/syntax"
"sort"
"strings"

enry_data "github.com/go-enry/go-enry/v2/data"
"github.com/google/zoekt/query"
)

Expand Down Expand Up @@ -98,6 +100,36 @@ func (d *indexData) simplify(in query.Q) query.Q {
})
case *query.Language:
_, has := d.metaData.LanguageMap[r.Language]
if !has && d.metaData.IndexFeatureVersion < 12 {
// For index files that haven't been re-indexed by go-enry,
// fall back to file-based matching and continue even if this
// repo doesn't have the specific language present.
extsForLang := enry_data.ExtensionsByLanguage[r.Language]
if extsForLang != nil {
extFrags := make([]string, 0, len(extsForLang))
for _, ext := range extsForLang {
extFrags = append(extFrags, regexp.QuoteMeta(ext))
}
if len(extFrags) > 0 {
pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
// inlined copy of query.regexpQuery
re, err := syntax.Parse(pattern, syntax.Perl)
if err != nil {
return &query.Const{Value: false}
}
if re.Op == syntax.OpLiteral {
return &query.Substring{
Pattern: string(re.Rune),
FileName: true,
}
}
return &query.Regexp{
Regexp: re,
FileName: true,
}
}
}
}
if !has {
return &query.Const{Value: false}
}
Expand Down Expand Up @@ -238,7 +270,7 @@ nextFileMatch:
RepositoryPriority: md.priority,
FileName: string(d.fileName(nextDoc)),
Checksum: d.getChecksum(nextDoc),
Language: d.languageMap[d.languages[nextDoc]],
Language: d.languageMap[d.getLanguage(nextDoc)],
}

if s := d.subRepos[nextDoc]; s > 0 {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd // indirect
github.com/fsnotify/fsnotify v1.4.9
github.com/gfleury/go-bitbucket-v1 v0.0.0-20200312180434-e5170e3280fb
github.com/go-enry/go-enry/v2 v2.8.0
github.com/go-git/go-git/v5 v5.4.2
github.com/gobwas/glob v0.2.3
github.com/google/go-cmp v0.5.5
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME
github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0=
github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0=
github.com/go-critic/go-critic v0.4.1/go.mod h1:7/14rZGnZbY6E38VEGk2kVhoq6itzc1E68facVDK23g=
github.com/go-enry/go-enry/v2 v2.8.0 h1:KMW4mSG+8uUF6FaD3iPkFqyfC5tF8gRrsYImq6yhHzo=
github.com/go-enry/go-enry/v2 v2.8.0/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4=
github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E=
github.com/go-git/go-billy/v5 v5.2.0/go.mod h1:pmpqyWchKfYfrkb/UVH4otLvyi/5gJlGI4Hb3ZqZ3W0=
Expand Down
55 changes: 55 additions & 0 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func testIndexBuilder(t *testing.T, repo *Repository, docs ...Document) *IndexBu
t.Fatalf("Add %d: %v", i, err)
}
}

return b
}

Expand Down Expand Up @@ -2137,3 +2138,57 @@ func TestSearchTypeFileName(t *testing.T) {
})
wantSingleMatch(res, "f2")
}

func TestSearchTypeLanguage(t *testing.T) {
b := testIndexBuilder(t, &Repository{
Name: "reponame",
},
Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")},
Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)},
Document{Name: "hello.h", Content: []byte(`#include <stdio.h>`)},
)

t.Log(b.languageMap)

wantSingleMatch := func(res *SearchResult, want string) {
t.Helper()
fmatches := res.Files
if len(fmatches) != 1 {
t.Errorf("got %v, want 1 matches", len(fmatches))
return
}
if len(fmatches[0].LineMatches) != 1 {
t.Errorf("got %d line matches", len(fmatches[0].LineMatches))
return
}
var got string
if fmatches[0].LineMatches[0].FileName {
got = fmatches[0].FileName
} else {
got = fmt.Sprintf("%s:%d", fmatches[0].FileName, fmatches[0].LineMatches[0].LineFragments[0].Offset)
}

if got != want {
t.Errorf("got %s, want %s", got, want)
}
}

res := searchForTest(t, b, &query.Language{Language: "Apex"})
wantSingleMatch(res, "apex.cls")

res = searchForTest(t, b, &query.Language{Language: "TeX"})
wantSingleMatch(res, "tex.cls")

res = searchForTest(t, b, &query.Language{Language: "C"})
wantSingleMatch(res, "hello.h")

// test fallback language search by pretending it's an older index version
res = searchForTest(t, b, &query.Language{Language: "C++"})
if len(res.Files) != 0 {
t.Errorf("got %d results for C++, want 0", len(res.Files))
}

b.featureVersion = 11 // force fallback
res = searchForTest(t, b, &query.Language{Language: "C++"})
wantSingleMatch(res, "hello.h")
}
25 changes: 18 additions & 7 deletions indexbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"sort"
"time"
"unicode/utf8"

"github.com/go-enry/go-enry/v2"
)

var _ = log.Println
Expand Down Expand Up @@ -187,10 +189,10 @@ type IndexBuilder struct {
subRepoIndices []map[string]uint32

// language => language code
languageMap map[string]byte
languageMap map[string]uint16

// languages codes
languages []byte
// language codes, uint16 encoded as little-endian
languages []uint8

// IndexTime will be used as the time if non-zero. Otherwise
// time.Now(). This is useful for doing reproducible builds in tests.
Expand Down Expand Up @@ -242,7 +244,7 @@ func newIndexBuilder() *IndexBuilder {
fileEndSymbol: []uint32{0},
symIndex: make(map[string]uint32),
symKindIndex: make(map[string]uint32),
languageMap: map[string]byte{},
languageMap: map[string]uint16{},
}
}

Expand Down Expand Up @@ -425,6 +427,15 @@ func (b *IndexBuilder) Add(doc Document) error {
}
}

if doc.Language == "" {
c := doc.Content
// classifier is faster on small files without losing much accuracy
if len(c) > 2048 {
c = c[:2048]
}
doc.Language = enry.GetLanguage(doc.Name, c)
}

sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
var last DocumentSection
for i, s := range doc.Symbols {
Expand Down Expand Up @@ -492,13 +503,13 @@ func (b *IndexBuilder) Add(doc Document) error {

langCode, ok := b.languageMap[doc.Language]
if !ok {
if len(b.languageMap) >= 255 {
if len(b.languageMap) >= 65535 {
return fmt.Errorf("too many languages")
}
langCode = byte(len(b.languageMap))
langCode = uint16(len(b.languageMap))
b.languageMap[doc.Language] = langCode
}
b.languages = append(b.languages, langCode)
b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8))

return nil
}
Expand Down
11 changes: 10 additions & 1 deletion indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ type indexData struct {
languages []byte

// inverse of LanguageMap in metaData
languageMap map[byte]string
languageMap map[uint16]string

repoListEntry []RepoListEntry

Expand Down Expand Up @@ -166,6 +166,15 @@ func (d *indexData) getChecksum(idx uint32) []byte {
return d.checksums[start : start+crc64.Size]
}

// getLanguage returns the language code stored in d.languages for the
// document at index idx. The entry width depends on the index feature
// version: one byte per document before version 12, two bytes (low byte
// first) from version 12 onwards.
func (d *indexData) getLanguage(idx uint32) uint16 {
	if d.metaData.IndexFeatureVersion >= 12 {
		// newer zoekt files have 16-bit language entries
		lo, hi := d.languages[idx*2], d.languages[idx*2+1]
		return uint16(lo) | uint16(hi)<<8
	}
	// older zoekt files had 8-bit language entries
	return uint16(d.languages[idx])
}

// calculates stats for files in the range [start, end).
func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
if start >= end {
Expand Down
8 changes: 6 additions & 2 deletions matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,11 @@ func (t *andMatchTree) String() string {
}

// String returns a short textual description of the regexp match tree in the
// form "re(<regexp>)". When t.fileName is set the description is prefixed
// with "f", giving "fre(<regexp>)", so the two variants can be told apart.
func (t *regexpMatchTree) String() string {
	// The prefix must be chosen before the single return below; an earlier
	// unconditional return would leave the fileName check unreachable.
	prefix := ""
	if t.fileName {
		prefix = "f"
	}
	return fmt.Sprintf("%sre(%s)", prefix, t.regexp)
}

func (t *orMatchTree) String() string {
Expand Down Expand Up @@ -874,7 +878,7 @@ func (d *indexData) newMatchTree(q query.Q) (matchTree, error) {
reason: "language",
numDocs: d.numDocs(),
predicate: func(docID uint32) bool {
return d.languages[docID] == code
return d.getLanguage(docID) == code
},
}, nil

Expand Down
2 changes: 1 addition & 1 deletion merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func merge(ds ...*indexData) (*IndexBuilder, error) {
// Content set below since it can return an error
// Branches set below since it requires lookups
SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]],
Language: d.languageMap[d.languages[docID]],
Language: d.languageMap[d.getLanguage(docID)],
// SkipReason not set, will be part of content from original indexer.
}

Expand Down
9 changes: 8 additions & 1 deletion query/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
"fmt"
"log"
"regexp/syntax"

"github.com/go-enry/go-enry/v2"
)

var _ = log.Printf
Expand Down Expand Up @@ -138,7 +140,12 @@ func parseExpr(in []byte) (Q, int, error) {
}
expr = q
case tokLang:
expr = &Language{Language: text}
canonical, ok := enry.GetLanguageByAlias(text)
if !ok {
expr = &Const{false}
} else {
expr = &Language{Language: canonical}
}

case tokSym:
if text == "" {
Expand Down
3 changes: 2 additions & 1 deletion query/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ func TestParseQuery(t *testing.T) {
{"c:abc", &Substring{Pattern: "abc", Content: true}},
{"content:abc", &Substring{Pattern: "abc", Content: true}},

{"lang:c++", &Language{"c++"}},
{"lang:c++", &Language{"C++"}},
{"lang:cpp", &Language{"C++"}},
{"sym:pqr", &Symbol{&Substring{Pattern: "pqr"}}},
{"sym:Pqr", &Symbol{&Substring{Pattern: "Pqr", CaseSensitive: true}}},
{"sym:.*", &Symbol{&Regexp{Regexp: mustParseRE(".*")}}},
Expand Down
2 changes: 1 addition & 1 deletion read.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
d.subRepoPaths = append(d.subRepoPaths, keys)
}

d.languageMap = map[byte]string{}
d.languageMap = map[uint16]string{}
for k, v := range d.metaData.LanguageMap {
d.languageMap[v] = k
}
Expand Down
8 changes: 4 additions & 4 deletions testdata/gen-shards.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

set -ex

go build ../cmd/zoekt-index

cp -r repo repo17

./zoekt-index -disable_ctags repo17
go run ../cmd/zoekt-index -disable_ctags repo17
go run ../cmd/zoekt-merge-index repo17_v16.00000.zoekt
mv compound*zoekt repo17_v17.00000.zoekt

rm -rf repo17
rm -rf repo17 repo17_v16.00000.zoekt zoekt-builder-shard-log.tsv

mv *.zoekt shards/
6 changes: 3 additions & 3 deletions testdata/golden/TestReadSearch/repo17_v17.00000.golden
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"FormatVersion": 17,
"FeatureVersion": 11,
"FeatureVersion": 12,
"FileMatches": [
[
{
Expand Down Expand Up @@ -29,7 +29,7 @@
],
"Content": null,
"Checksum": "n9fUYqacPXg=",
"Language": "",
"Language": "Go",
"SubRepositoryName": "",
"SubRepositoryPath": "",
"Version": ""
Expand Down Expand Up @@ -62,7 +62,7 @@
],
"Content": null,
"Checksum": "n9fUYqacPXg=",
"Language": "",
"Language": "Go",
"SubRepositoryName": "",
"SubRepositoryPath": "",
"Version": ""
Expand Down
Loading

0 comments on commit d86fb30

Please sign in to comment.