Skip to content

Commit

Permalink
feat: Minimize allocations in SCIP symbol parsing.
Browse files Browse the repository at this point in the history
See PR for benchmarks.
  • Loading branch information
varungandhi-src committed Dec 2, 2024
1 parent 79fb777 commit c269d2b
Show file tree
Hide file tree
Showing 26 changed files with 3,570 additions and 2,410 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ jobs:
- uses: ./.github/actions/asdf
with:
golang: true
- run: go test ./... -v
- run: go test ./... -v -tags asserts
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ reprolang/target
reprolang/src/grammar.json
reprolang/src/node-types.json
bindings/typescript
docs/scip.md
.bin
2 changes: 1 addition & 1 deletion .tool-versions
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
golang 1.20.14
golang 1.22.0
nodejs 16.20.2
shellcheck 0.7.1
yarn 1.22.22
Expand Down
19 changes: 19 additions & 0 deletions Development.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- [Project structure](#project-structure)
- [Code generation](#code-generation)
- [Debugging](#debugging)
- [Benchmarking](#benchmarking)
- [Testing and adding new SCIP semantics](#testing-and-adding-new-scip-semantics)
- [Release a new version](#release-a-new-version)

Expand Down Expand Up @@ -69,6 +70,24 @@ and is not recommended for use in other settings.
scip lint /path/to/index.scip
```

## Benchmarking

For benchmarks, one can put test SCIP indexes under `dev/sample_indexes`.

Sourcegraph teammates can download several large indexes
from this [Google drive folder](https://drive.google.com/drive/folders/1z62Se7eHaa5T89a16-y7s0Z1qbRY4VCg).

After that you can run:

```bash
go run ./bindings/go/scip/speedtest
```

to see the results.

Make sure to share benchmark results when making changes to
the symbol parsing logic.

## Testing and adding new SCIP semantics

It is helpful to use reprolang to check the existing code navigation behavior,
Expand Down
9 changes: 9 additions & 0 deletions bindings/go/scip/assertions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build asserts

package scip

func assert(cond bool, msg string) {
if !cond {
panic(msg)
}
}
6 changes: 6 additions & 0 deletions bindings/go/scip/assertions_noop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//go:build !asserts

package scip

// assert is a noop in release builds - the implementation is in assertions.go
func assert(cond bool, msg string) {}
72 changes: 72 additions & 0 deletions bindings/go/scip/internal/compat_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package internal

import (
"io"
"os"
"path/filepath"
"sync/atomic"
"testing"

"github.com/sourcegraph/beaut"
"github.com/sourcegraph/beaut/lib/knownwf"
conciter "github.com/sourcegraph/conc/iter"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/proto"
)

func TestParseCompat(t *testing.T) {
for _, path := range shared.SampleIndexes() {
t.Run(filepath.Base(path), func(t *testing.T) {
t.Parallel()
scipReader, err := os.Open(path)
require.Nil(t, err)
scipBytes, err := io.ReadAll(scipReader)
require.Nil(t, err)
scipIndex := scip.Index{}
require.NoError(t, proto.Unmarshal(scipBytes, &scipIndex))
var total atomic.Int64
conciter.ForEach(scipIndex.Documents, func(docPtr **scip.Document) {
document := *docPtr
if total.Load() > 1000*1000 {
return
}
total.Add(int64(len(document.Occurrences)))
var newSym scip.Symbol
for i := 0; i < len(document.Occurrences); i++ {
occ := document.Occurrences[i]
oldSym, oldErr := ParsePartialSymbolV1ToBeDeleted(occ.Symbol, true)
var newErr error
require.NotPanics(t, func() {
str := beaut.NewUTF8StringUnchecked(occ.Symbol, knownwf.UTF8DeserializedFromProtobufString)
newErr = scip.ParseSymbolUTF8With(str, scip.ParseSymbolOptions{
IncludeDescriptors: true,
RecordOutput: &newSym,
})
}, "panic for symbol: %q", occ.Symbol)
if oldErr != nil {
require.Error(t, newErr,
"old parser gave error %v but parse was successful with new parser (symbol: %q)",
oldErr.Error(), occ.Symbol)
continue
} else if newErr != nil {
require.NoError(t, newErr,
"new parser gave error %v but parse was successful with old parser (symbol: %q)",
newErr.Error(), occ.Symbol)
}
require.Equal(t, oldSym.Scheme, newSym.Scheme)
require.Equal(t, oldSym.Package, newSym.Package)
require.Equalf(t, len(oldSym.Descriptors), len(newSym.Descriptors), "symbol: %v, d1: %+v, d2: %+v", occ.Symbol,
oldSym.Descriptors, newSym.Descriptors)
for i, d := range oldSym.Descriptors {
dnew := newSym.Descriptors[i]
require.Equal(t, d.Name, dnew.Name, "symbol: %v", occ.Symbol)
require.Equal(t, d.Suffix, dnew.Suffix, "symbol: %v", occ.Symbol)
require.Equal(t, d.Disambiguator, dnew.Disambiguator, "symbol: %v", occ.Symbol)
}
}
})
})
}
}
239 changes: 239 additions & 0 deletions bindings/go/scip/internal/old_symbol_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
package internal

import (
"fmt"
"strings"

"github.com/cockroachdb/errors"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
)

func tryParseLocalSymbol(symbol string) (string, error) {
if !strings.HasPrefix(symbol, "local ") {
return "", nil
}
suffix := symbol[6:]
if len(suffix) > 0 && shared.IsSimpleIdentifier(suffix) {
return suffix, nil
}
return "", errors.Newf("expected format 'local <simple-identifier>' but got: %v", symbol)
}

// ParsePartialSymbolV1ToBeDeleted parses an SCIP string into the Symbol message
// with the option to exclude the `.Descriptor` field.
//
// Nov 30 2024: This is currently only present for benchmarking + compatibility
// reasons. We can remove this in the future once we're confident that the new
// parser handles everything correctly.
func ParsePartialSymbolV1ToBeDeleted(symbol string, includeDescriptors bool) (*scip.Symbol, error) {
// TODO: Rip out this, and call
local, err := tryParseLocalSymbol(symbol)
if err != nil {
return nil, err
}
if local != "" {
return &scip.Symbol{
Scheme: "local",
Descriptors: []*scip.Descriptor{
{Name: local, Suffix: scip.Descriptor_Local},
},
}, nil
}
s := newSymbolParser(symbol)
scheme, err := s.acceptSpaceEscapedIdentifier("scheme")
if err != nil {
return nil, err
}
manager, err := s.acceptSpaceEscapedIdentifier("package manager")
if err != nil {
return nil, err
}
if manager == "." {
manager = ""
}
packageName, err := s.acceptSpaceEscapedIdentifier("package name")
if err != nil {
return nil, err
}
if packageName == "." {
packageName = ""
}
packageVersion, err := s.acceptSpaceEscapedIdentifier("package version")
if err != nil {
return nil, err
}
if packageVersion == "." {
packageVersion = ""
}
var descriptors []*scip.Descriptor
if includeDescriptors {
descriptors, err = s.parseDescriptors()
}
return &scip.Symbol{
Scheme: scheme,
Package: &scip.Package{
Manager: manager,
Name: packageName,
Version: packageVersion,
},
Descriptors: descriptors,
}, err
}

type symbolParser struct {
Symbol []rune
index int
SymbolString string
}

func newSymbolParser(symbol string) *symbolParser {
return &symbolParser{
SymbolString: symbol,
Symbol: []rune(symbol),
index: 0,
}
}

func (s *symbolParser) error(message string) error {
return errors.Newf("%s\n%s\n%s^", message, s.SymbolString, strings.Repeat("_", s.index))
}

func (s *symbolParser) current() rune {
if s.index < len(s.Symbol) {
return s.Symbol[s.index]
}
return '\x00'
}

func (s *symbolParser) peekNext() rune {
if s.index+1 < len(s.Symbol) {
return s.Symbol[s.index]
}
return 0
}

func (s *symbolParser) parseDescriptors() ([]*scip.Descriptor, error) {
var result []*scip.Descriptor
for s.index < len(s.Symbol) {
descriptor, err := s.parseDescriptor()
if err != nil {
return nil, err
}
result = append(result, descriptor)
}
return result, nil
}

func (s *symbolParser) parseDescriptor() (*scip.Descriptor, error) {
start := s.index
switch s.peekNext() {
case '(':
s.index++
name, err := s.acceptIdentifier("parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Parameter}, s.acceptCharacter(')', "closing parameter name")
case '[':
s.index++
name, err := s.acceptIdentifier("type parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_TypeParameter}, s.acceptCharacter(']', "closing type parameter name")
default:
name, err := s.acceptIdentifier("descriptor name")
if err != nil {
return nil, err
}
suffix := s.current()
s.index++
switch suffix {
case '(':
disambiguator := ""
if s.peekNext() != ')' {
disambiguator, err = s.acceptIdentifier("method disambiguator")
if err != nil {
return nil, err
}
}
err = s.acceptCharacter(')', "closing method")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Disambiguator: disambiguator, Suffix: scip.Descriptor_Method}, s.acceptCharacter('.', "closing method")
case '/':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Namespace}, nil
case '.':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Term}, nil
case '#':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Type}, nil
case ':':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Meta}, nil
case '!':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Macro}, nil
default:
}
}

end := s.index
if s.index > len(s.Symbol) {
end = len(s.Symbol)
}
return nil, errors.Newf("unrecognized descriptor %q", string(s.Symbol[start:end]))
}

func (s *symbolParser) acceptIdentifier(what string) (string, error) {
if s.current() == '`' {
s.index++
return s.acceptBacktickEscapedIdentifier(what)
}
start := s.index
for s.index < len(s.Symbol) && shared.IsSimpleIdentifierCharacter(s.current()) {
s.index++
}
if start == s.index {
return "", s.error("empty identifier")
}
return string(s.Symbol[start:s.index]), nil
}

func (s *symbolParser) acceptSpaceEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, ' ')
}

func (s *symbolParser) acceptBacktickEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, '`')
}

func (s *symbolParser) acceptEscapedIdentifier(what string, escapeCharacter rune) (string, error) {
builder := strings.Builder{}
for s.index < len(s.Symbol) {
ch := s.current()
if ch == escapeCharacter {
s.index++
if s.index >= len(s.Symbol) {
break
}
if s.current() == escapeCharacter {
// Escaped space character.
builder.WriteRune(ch)
} else {
return builder.String(), nil
}
} else {
builder.WriteRune(ch)
}
s.index++
}
return "", s.error(fmt.Sprintf("reached end of symbol while parsing <%s>, expected a '%v' character", what, string(escapeCharacter)))
}

func (s *symbolParser) acceptCharacter(r rune, what string) error {
if s.current() == r {
s.index++
return nil
}
return s.error(fmt.Sprintf("expected '%v', obtained '%v', while parsing %v", string(r), string(s.current()), what))
}
Loading

0 comments on commit c269d2b

Please sign in to comment.