-
Notifications
You must be signed in to change notification settings - Fork 92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Symbol search: improve performance for anchored queries #682
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -239,6 +239,7 @@ func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map | |
type symbolSubstrMatchTree struct { | ||
*substrMatchTree | ||
|
||
anchored bool | ||
patternSize uint32 | ||
fileEndRunes []uint32 | ||
fileEndSymbol []uint32 | ||
|
@@ -292,12 +293,19 @@ func (t *symbolSubstrMatchTree) prepare(doc uint32) { | |
continue | ||
} | ||
|
||
if end <= sections[secIdx].End { | ||
t.current[0].symbol = true | ||
t.current[0].symbolIdx = uint32(secIdx) | ||
trimmed = append(trimmed, t.current[0]) | ||
if end > sections[secIdx].End { | ||
t.current = t.current[1:] | ||
continue | ||
} | ||
|
||
if t.anchored && !(start == sections[secIdx].Start && end == sections[secIdx].End) { | ||
t.current = t.current[1:] | ||
continue | ||
} | ||
|
||
t.current[0].symbol = true | ||
t.current[0].symbolIdx = uint32(secIdx) | ||
trimmed = append(trimmed, t.current[0]) | ||
camdencheek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
t.current = t.current[1:] | ||
} | ||
t.current = trimmed | ||
|
@@ -983,14 +991,33 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) | |
optCopy := opt | ||
optCopy.DisableWordMatchOptimization = true | ||
|
||
subMT, err := d.newMatchTree(s.Expr, optCopy) | ||
anchored := false | ||
expr := s.Expr | ||
switch e := s.Expr.(type) { | ||
case *query.Regexp: | ||
pattern := e.Regexp.String() | ||
if strings.HasPrefix(pattern, "^") && strings.HasSuffix(pattern, "$") { | ||
camdencheek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pattern = pattern[1 : len(pattern)-1] | ||
parsedPattern, err := syntax.Parse(pattern, e.Regexp.Flags) | ||
if err != nil { | ||
return nil, err | ||
} | ||
camdencheek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
eCopy := *e | ||
eCopy.Regexp = parsedPattern | ||
expr = &eCopy | ||
anchored = true | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When constructing the match tree for symbols, try to strip anchors off the input pattern. We keep track of whether the pattern was anchored, which we then use to post-filter the results so we only return ones that exactly match the symbol boundaries. |
||
|
||
subMT, err := d.newMatchTree(expr, optCopy) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I looked at putting this logic into |
||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
if substr, ok := subMT.(*substrMatchTree); ok { | ||
return &symbolSubstrMatchTree{ | ||
substrMatchTree: substr, | ||
anchored: anchored, | ||
patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), | ||
fileEndRunes: d.fileEndRunes, | ||
fileEndSymbol: d.fileEndSymbol, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the pattern was anchored, check that the match is not just contained in a symbol's range, but matches it exactly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
based on this code should the variable name be more something along the lines of
exact
? This is on substr not a regexp, I think anchored can be misinterpreted a bit. There are others parts in the zoekt API where we use the word exact to mean the string must match exactly.