From 15bb13117fb1cc607bf87046d41ac7b53084c18b Mon Sep 17 00:00:00 2001 From: Antonio Jesus Navarro Perez Date: Tue, 28 Aug 2018 17:27:18 +0200 Subject: [PATCH] Refactor Oniguruma integration Instead of using a command to change imports before the build, use a build tag to generate the correct binary. This will allow applications to compile enry using Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez --- .travis.yml | 6 +-- Makefile | 15 ------- README.md | 8 ++-- common.go | 14 +++---- .../code-generator/generator/heuristics.go | 7 ++-- internal/tokenizer/tokenize.go | 39 ++++++++++--------- regex/oniguruma.go | 17 ++++++++ regex/standard.go | 17 ++++++++ 8 files changed, 71 insertions(+), 52 deletions(-) create mode 100644 regex/oniguruma.go create mode 100644 regex/standard.go diff --git a/.travis.yml b/.travis.yml index bb734c4d..5cd010f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,9 +24,7 @@ install: - mkdir -p $GOPATH/src/gopkg.in/src-d - ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1 - cd $GOPATH/src/gopkg.in/src-d/enry.v1 - - if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi - - go get -v -t ./... - + - if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./... 
script: - make test-coverage @@ -100,7 +98,7 @@ jobs: - sudo apt-get update - sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils - cd ${HOME} - - curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf - + - curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf - - cd $GOPATH/src/gopkg.in/src-d/enry.v1 script: diff --git a/Makefile b/Makefile index 3f94e8c6..335cd2c7 100644 --- a/Makefile +++ b/Makefile @@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib HEADER_FILE=libenry.h NATIVE_LIB=./shared/enry.go -# source files to be patched for using "rubex" instead of "regexp" -RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go -RUBEX_ORIG := $(RUBEX_PATCHED:=.orig) - -.PHONY: revert-oniguruma - $(LINGUIST_PATH): git clone https://github.com/github/linguist.git $@ @@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH) mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \ benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench -$(RUBEX_ORIG): %.orig : % - sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $< - @touch $@ - -oniguruma: $(RUBEX_ORIG) - -revert-oniguruma: - @for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done - build-cli: go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go diff --git a/README.md b/README.md index 0da8228a..38cac537 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,10 @@ On Ubuntu, it is sudo apt install libonig-dev ``` -To build enry with Oniguruma regexps, patch the imports with +To build enry with Oniguruma regexps use the `oniguruma` build tag ``` -make oniguruma +go get -v -t --tags oniguruma ./... ``` and then rebuild the project. 
@@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin * [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) * [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml) -Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code. +Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code. So we update the generated code as needed, without any specific criteria. If you want to update *enry* because of changes in linguist, you can run the *go @@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run: benchmarks/run.sh -from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram). +from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram). This can take some time, so to run local benchmarks for a quick check you can either: diff --git a/common.go b/common.go index a0dd6664..c2f06364 100644 --- a/common.go +++ b/common.go @@ -4,10 +4,10 @@ import ( "bufio" "bytes" "path/filepath" - "regexp" "strings" "gopkg.in/src-d/enry.v1/data" + "gopkg.in/src-d/enry.v1/regex" ) // OtherLanguage is used as a zero value when a function can not return a specific language. 
@@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) { } var ( - reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) - reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) - reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) - reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) + reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) + reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) + reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) + reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) ) // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content. @@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st } var ( - shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`) - pythonVersion = regexp.MustCompile(`python\d\.\d+`) + shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`) + pythonVersion = regex.MustCompile(`python\d\.\d+`) ) func getInterpreter(data []byte) (interpreter string) { diff --git a/internal/code-generator/generator/heuristics.go b/internal/code-generator/generator/heuristics.go index 212aafd6..ebd17444 100644 --- a/internal/code-generator/generator/heuristics.go +++ b/internal/code-generator/generator/heuristics.go @@ -6,10 +6,11 @@ import ( "fmt" "io" "io/ioutil" - "regexp" "strconv" "strings" "text/template" + + "gopkg.in/src-d/enry.v1/regex" ) // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature. 
@@ -38,7 +39,7 @@ const ( ) var ( - disambLine = regexp.MustCompile(`^(\s*)disambiguate`) + disambLine = regex.MustCompile(`^(\s*)disambiguate`) definedRegs = make(map[string]string) illegalCharacter = map[string]string{ "#": "Sharp", @@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string { func includeToRegExp(include string) string { content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)] content = strings.Trim(content, `"'`) - return regexp.QuoteMeta(content) + return regex.QuoteMeta(content) } func getLanguages(line string) []string { diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go index ce02e310..a836d799 100644 --- a/internal/tokenizer/tokenize.go +++ b/internal/tokenizer/tokenize.go @@ -2,7 +2,8 @@ package tokenizer import ( "bytes" - "regexp" + + "gopkg.in/src-d/enry.v1/regex" ) const byteLimit = 100000 @@ -72,20 +73,20 @@ var ( // // These regexps were converted to work in the same way for both engines: // - reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) - reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) - reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/||\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) - reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) - reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) - rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) - reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) - reSGMLComment = regexp.MustCompile(`()`) - reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) - reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`) - reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) - reOperators = 
regexp.MustCompile(`<|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) + reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) + reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) + rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`) + reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) + reSGMLComment = regex.MustCompile(`()`) + reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) + reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`) + reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) + reOperators = regex.MustCompile(`<