Skip to content

Commit

Permalink
tokenizer: cleanup & attributions
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Bezzubov <[email protected]>
  • Loading branch information
bzz committed Apr 8, 2019
1 parent d45cddf commit 682f026
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 58 deletions.
11 changes: 3 additions & 8 deletions internal/tokenizer/flex/linguist.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
enum tokenizer_type {
NO_ACTION,
REGULAR_TOKEN,
Expand All @@ -10,11 +11,5 @@ struct tokenizer_extra {
enum tokenizer_type type;
};

// #include <stddef.h>

// #ifdef __APPLE__
// char *strndup(const char *s1, size_t n);
// #elif defined(_WIN32) || defined(_WIN64)
// char *strndup(const char *s1, size_t n);
// #pragma warning (disable: 4244)
// #endif // _WIN32 || _WIN64
// TODO(bzz): port Windows support from
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
20 changes: 1 addition & 19 deletions internal/tokenizer/flex/tokenize_c.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,15 @@ package flex
import "C"
import "unsafe"

// TokenizeC is only calling a C-flex based tokenizer from linguist
func TokenizeC(content []byte) []string {
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
// C.tokenizer_extract_tokens((*C.char)(cs))
return nil
}

const maxTokenLen = 32


// TokenizeFlex implements the tokenizer by calling the Flex-generated C code from linguist.
// This is a transliteration of the C code at https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
var _len C.ulong
var r C.int

Expand All @@ -50,7 +41,6 @@ func TokenizeFlex(content []byte) []string {
_len = C.strlen(extra.token)
if (_len <= maxTokenLen) {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
//rb_ary_push(ary, rb_str_new(extra.token, len))
}
C.free(unsafe.Pointer(extra.token))
break
Expand All @@ -59,9 +49,6 @@ func TokenizeFlex(content []byte) []string {
if (_len <= maxTokenLen) {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
//s = rb_str_new2("SHEBANG#!");
//rb_str_cat(s, extra.token, len);
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
Expand All @@ -70,9 +57,6 @@ func TokenizeFlex(content []byte) []string {
if (_len <= maxTokenLen) {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
//s = rb_str_new(extra.token, len);
//rb_str_cat2(s, ">");
//rb_ary_push(ary, s);
}
C.free(unsafe.Pointer(extra.token))
break
Expand All @@ -84,8 +68,6 @@ func TokenizeFlex(content []byte) []string {

C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
// C.free(unsafe.Pointer(extra))
// C.free(unsafe.Pointer(scanner))

return ary
}
25 changes: 0 additions & 25 deletions internal/tokenizer/flex/tokenize_c_test.go

This file was deleted.

6 changes: 0 additions & 6 deletions internal/tokenizer/tokenize_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,3 @@ func BenchmarkTokenizer(b *testing.B) {
}
}
}

//TODO(bzz): introduce tokenizer benchmark suite
// baseline - just read the files
// RE2
// oniguruma
// cgo to flex-based impl

0 comments on commit 682f026

Please sign in to comment.