Skip to content

Commit

Permalink
Refactor Oniguruma integration
Browse files Browse the repository at this point in the history
Instead of using a command to change imports before the build, use a build tag to generate the correct binary.

This will allow applications to compile enry using oniguruma with less trouble.

Signed-off-by: Antonio Jesus Navarro Perez <[email protected]>
  • Loading branch information
ajnavarro authored and dennwc committed Aug 29, 2018
1 parent 8da8516 commit 15bb131
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 52 deletions.
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ install:
- mkdir -p $GOPATH/src/gopkg.in/src-d
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
- go get -v -t ./...

- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
script:
- make test-coverage

Expand Down Expand Up @@ -100,7 +98,7 @@ jobs:
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
- cd ${HOME}
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
- cd $GOPATH/src/gopkg.in/src-d/enry.v1

script:
Expand Down
15 changes: 0 additions & 15 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
HEADER_FILE=libenry.h
NATIVE_LIB=./shared/enry.go

# source files to be patched for using "rubex" instead of "regexp"
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)

.PHONY: revert-oniguruma

$(LINGUIST_PATH):
git clone https://github.com/github/linguist.git $@

Expand All @@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench

$(RUBEX_ORIG): %.orig : %
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
@touch $@

oniguruma: $(RUBEX_ORIG)

revert-oniguruma:
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done

build-cli:
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go

Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ On Ubuntu, it is
sudo apt install libonig-dev
```

To build enry with Oniguruma regexps, patch the imports with
To build enry with Oniguruma regexps, use the `oniguruma` build tag

```
make oniguruma
go get -v -t --tags oniguruma ./...
```

and then rebuild the project.
Expand Down Expand Up @@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)

Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
So we update the generated code as needed, without any specific criteria.

If you want to update *enry* because of changes in linguist, you can run the *go
Expand Down Expand Up @@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:

benchmarks/run.sh

from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
from the project's root directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have [gnuplot](http://gnuplot.info) installed on your system to get the histogram).

This can take some time, so to run local benchmarks for a quick check you can either:

Expand Down
14 changes: 7 additions & 7 deletions common.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ import (
"bufio"
"bytes"
"path/filepath"
"regexp"
"strings"

"gopkg.in/src-d/enry.v1/data"
"gopkg.in/src-d/enry.v1/regex"
)

// OtherLanguage is used as a zero value when a function can not return a specific language.
Expand Down Expand Up @@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
}

var (
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)

// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
Expand Down Expand Up @@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
}

var (
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regex.MustCompile(`python\d\.\d+`)
)

func getInterpreter(data []byte) (interpreter string) {
Expand Down
7 changes: 4 additions & 3 deletions internal/code-generator/generator/heuristics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ import (
"fmt"
"io"
"io/ioutil"
"regexp"
"strconv"
"strings"
"text/template"

"gopkg.in/src-d/enry.v1/regex"
)

// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
Expand Down Expand Up @@ -38,7 +39,7 @@ const (
)

var (
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
definedRegs = make(map[string]string)
illegalCharacter = map[string]string{
"#": "Sharp",
Expand Down Expand Up @@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
func includeToRegExp(include string) string {
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
content = strings.Trim(content, `"'`)
return regexp.QuoteMeta(content)
return regex.QuoteMeta(content)
}

func getLanguages(line string) []string {
Expand Down
39 changes: 20 additions & 19 deletions internal/tokenizer/tokenize.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ package tokenizer

import (
"bytes"
"regexp"

"gopkg.in/src-d/enry.v1/regex"
)

const byteLimit = 100000
Expand Down Expand Up @@ -72,20 +73,20 @@ var (
//
// These regexps were converted to work in the same way for both engines:
//
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

regexToSkip = []*regexp.Regexp{
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

regexToSkip = []regex.EnryRegexp{
// The order must be this
reLiteralStringQuotes,
reMultilineComment,
Expand Down Expand Up @@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
return tokenShebang
}

func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `))
return content, tokens
}

func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, rePunctuation)
return commonExtractAndReplace(content, rePunctuation)
}

func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reRegularToken)
return commonExtractAndReplace(content, reRegularToken)
}

func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtracAndReplace(content, reOperators)
return commonExtractAndReplace(content, reOperators)
}

func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
Expand Down
17 changes: 17 additions & 0 deletions regex/oniguruma.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// +build oniguruma

package regex

import (
"github.com/moovweb/rubex"
)

// EnryRegexp is an alias for *rubex.Regexp. This definition is selected when
// the package is built with the "oniguruma" build tag, so callers can refer to
// regex.EnryRegexp without importing a specific regexp engine themselves.
type EnryRegexp = *rubex.Regexp

// MustCompile compiles str by delegating to rubex.MustCompile and returns the
// resulting EnryRegexp.
// NOTE(review): presumably panics on an invalid pattern, mirroring
// regexp.MustCompile — confirm against the rubex documentation.
func MustCompile(str string) EnryRegexp {
return rubex.MustCompile(str)
}

// QuoteMeta returns the result of rubex.QuoteMeta applied to s.
// NOTE(review): presumably escapes regexp metacharacters in s, mirroring
// regexp.QuoteMeta — confirm against the rubex documentation.
func QuoteMeta(s string) string {
return rubex.QuoteMeta(s)
}
17 changes: 17 additions & 0 deletions regex/standard.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// +build !oniguruma

package regex

import (
"regexp"
)

// EnryRegexp is an alias for the standard library's *regexp.Regexp. This
// definition is selected when the package is built without the "oniguruma"
// build tag, so callers can refer to regex.EnryRegexp without importing a
// specific regexp engine themselves.
type EnryRegexp = *regexp.Regexp

// MustCompile compiles str by delegating to regexp.MustCompile and returns the
// resulting EnryRegexp. Like regexp.MustCompile, it panics if str cannot be
// parsed, so it is intended for patterns known to be valid.
func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str)
}

// QuoteMeta delegates to regexp.QuoteMeta, returning a copy of s in which all
// regular expression metacharacters are escaped so that the result matches the
// literal text of s.
func QuoteMeta(s string) string {
return regexp.QuoteMeta(s)
}

0 comments on commit 15bb131

Please sign in to comment.