Skip to content

Commit

Permalink
Rewrite GetLanguages to work like Linguist.detect
Browse files Browse the repository at this point in the history
Prior to this change, GetLanguages collected all candidate languages from each
strategy to pass to the next strategy (without de-duplicating them). Linguist
only uses the previous strategy's candidates for the next strategy. Also, it
would overwrite languages with nil if a strategy returned that, so you could get
into a situation where you go from multiple languages to no language.

See the Ruby code for details: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49

This addresses src-d/enry#207 because GetLanguages
should not return all candidates detected, otherwise it would work differently
than Linguist.
look committed Apr 13, 2021
1 parent 7f5d84a commit bf7167f
Showing 2 changed files with 49 additions and 7 deletions.
24 changes: 17 additions & 7 deletions common.go
Original file line number Diff line number Diff line change
@@ -118,7 +118,13 @@ func getLanguageBySpecificClassifier(content []byte, candidates []string, classi
}

// GetLanguages applies a sequence of strategies based on the given filename and content
// to find out the most probably languages to return.
// to find out the most probable languages to return.
//
// If it finds a strategy that produces a single result, it will be returned;
// otherwise the last strategy that returned multiple results will be returned.
// If the content is binary, no results will be returned. This matches the
// behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49
//
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
// The function won't read the file, given an empty content.
func GetLanguages(filename string, content []byte) []string {
@@ -127,16 +133,20 @@ func GetLanguages(filename string, content []byte) []string {
}

var languages []string
candidates := []string{}
for _, strategy := range DefaultStrategies {
languages = strategy(filename, content, candidates)
if len(languages) == 1 {
return languages
candidates := strategy(filename, content, languages)
// No candidates, continue to next strategy without updating languages
if len(candidates) == 0 {
continue
}

if len(languages) > 0 {
candidates = append(candidates, languages...)
// Only one candidate match, return it
if len(candidates) == 1 {
return candidates
}

// Save the candidates from this strategy to pass on to the next strategy, like Linguist
languages = candidates
}

return languages
32 changes: 32 additions & 0 deletions common_test.go
Original file line number Diff line number Diff line change
@@ -119,6 +119,38 @@ func (s *EnryTestSuite) TestGetLanguage() {
}
}

// TestGetLanguages runs GetLanguages over a table of filename/content pairs
// and checks that the returned language list matches the expected one exactly,
// including order.
func (s *EnryTestSuite) TestGetLanguages() {
	type languagesCase struct {
		name     string
		filename string
		content  []byte
		expected []string
	}

	cases := []languagesCase{
		// Neither a filename nor content is given, so nothing can be detected.
		{name: "TestGetLanguages_0", filename: "", content: []byte{}, expected: nil},
		// GetLanguagesByExtension is the strategy that matches here; with no
		// content available, its candidates are what comes back.
		{name: "TestGetLanguages_1", filename: "foo.h", content: []byte{}, expected: []string{"C", "C++", "Objective-C"}},
		// A single candidate from GetLanguagesByExtension is an unambiguous match.
		{name: "TestGetLanguages_2", filename: "foo.groovy", content: []byte{}, expected: []string{"Groovy"}},
		// For .rs, GetLanguagesByExtension yields "Rust" and "RenderScript";
		// GetLanguagesByContent then takes the first matching rule, which is Rust here...
		{name: "TestGetLanguages_3", filename: "foo.rs", content: []byte("use \n#include"), expected: []string{"Rust"}},
		// ...and RenderScript here. The content must not match any Rust regex,
		// because the Rust rule runs first.
		{name: "TestGetLanguages_4", filename: "foo.rs", content: []byte("#include"), expected: []string{"RenderScript"}},
		// For .mod, GetLanguagesByExtension yields "AMPL", "Linux Kernel Module",
		// "Modula-2" and "XML". Without content, GetLanguagesByContent always
		// returns Linux Kernel Module and AMPL, and no later classifier can
		// narrow that down without content.
		{name: "TestGetLanguages_5", filename: "foo.mod", content: []byte{}, expected: []string{"Linux Kernel Module", "AMPL"}},
		// Given some AMPL tokens, the DefaultClassifier ranks AMPL as the most likely language.
		{name: "TestGetLanguages_6", filename: "foo.mod", content: []byte("BEAMS ROWS - TotalWeight"), expected: []string{"AMPL", "Linux Kernel Module"}},
	}

	for i := range cases {
		tc := cases[i]
		got := GetLanguages(tc.filename, tc.content)
		// The failure message names the case and shows both the actual and expected values.
		failureMsg := fmt.Sprintf("%v: %v, expected: %v", tc.name, got, tc.expected)
		assert.Equal(s.T(), tc.expected, got, failureMsg)
	}
}

func (s *EnryTestSuite) TestGetLanguagesByModelineLinguist() {
var modelinesDir = filepath.Join(s.tmpLinguist, "test", "fixtures", "Data", "Modelines")

0 comments on commit bf7167f

Please sign in to comment.