From 0d30d047715729da6035065e29fbe09db7cec6dd Mon Sep 17 00:00:00 2001
From: changhui lee
Date: Sun, 26 Nov 2023 03:21:16 +0900
Subject: [PATCH] fix: add c/cpp detect rule and tests for detector/tokenizer
 (#23)

---
 packages/core/src/constants.ts              |   2 +
 packages/core/src/detect.ts                 |  26 +++-
 packages/core/src/rules/cpp.ts              |   5 +-
 packages/core/tests/detect/c-cpp.test.ts    |  75 ++++++++++++
 packages/core/tests/tokenizer/c-cpp.test.ts | 127 ++++++++++++++++++++
 5 files changed, 228 insertions(+), 7 deletions(-)
 create mode 100644 packages/core/tests/detect/c-cpp.test.ts
 create mode 100644 packages/core/tests/tokenizer/c-cpp.test.ts

diff --git a/packages/core/src/constants.ts b/packages/core/src/constants.ts
index 95f0f9f..063fc23 100644
--- a/packages/core/src/constants.ts
+++ b/packages/core/src/constants.ts
@@ -2,4 +2,6 @@ export enum Language {
   TypeScript = 'typescript',
   JavaScript = 'javascript',
   Golang = 'golang',
+  C = 'c',
+  Cpp = 'cpp',
 }
diff --git a/packages/core/src/detect.ts b/packages/core/src/detect.ts
index fafce0c..c27a1a9 100644
--- a/packages/core/src/detect.ts
+++ b/packages/core/src/detect.ts
@@ -1,19 +1,33 @@
 import { Language } from './constants.ts';
 
 const keywordWeights: Record<string, Array<[RegExp, number]>> = {
-  javascript: [
+  [Language.JavaScript]: [
+    [/console\./g, 100],
     [
-      /\b(console|await|async|function|export|import|this|class|for|let|const|map|join|require)\b/g,
+      /\b(await|async|function|export|import|this|class|for|let|const|map|join|require)\b/g,
       10,
     ],
   ],
-  typescript: [
+  [Language.TypeScript]: [
+    [/console\./g, 100],
     [
-      /\b(console|await|async|function|export|import|this|class|for|let|infer|const|map|join|require|type|implements|interface|declare|namespace|unknown|bigint|any|void|number|boolean|string|object|never|enum|unique symbol|symbol)\b/g,
+      /\b(await|async|function|export|import|this|class|for|let|infer|const|map|join|require|type|implements|interface|declare|namespace|unknown|bigint|any|void|number|boolean|string|object|never|enum|unique symbol|symbol)\b/g,
       10,
     ],
   ],
-  golang: [[/\b(defer|go|chan|fmt|select|package)\b/g, 100]],
+  [Language.Golang]: [[/\b(defer|go|chan|fmt|select|package)\b/g, 100]],
+  [Language.C]: [
+    [/#include\b|printf\s*\(|scanf\s*\(/g, 100],
+    [/\b(union|typedef|struct|register|volatile|goto|sizeof)\b/g, 10],
+  ],
+  [Language.Cpp]: [
+    [/#include\b|printf\s*\(|scanf\s*\(/g, 100],
+    [
+      /\b(cin|cout|template|dynamic_cast|static_cast|reinterpret_cast|const_cast|typeid|nullptr|constexpr|decltype|static_assert|noexcept|thread_local|alignas|alignof)\b/g,
+      100,
+    ],
+    [/\b(union|typedef|struct|register|virtual|volatile|goto|sizeof)\b/g, 10],
+  ],
 };
 
 export const detectLanguage = (code: string): string => {
@@ -22,7 +36,7 @@ export const detectLanguage = (code: string): string => {
     scores[lang] = 0;
     for (const [pattern, weight] of patterns) {
       const matches = [...code.matchAll(pattern)];
-      if (matches) {
+      if (matches.length > 0) {
         scores[lang] += matches.length * weight;
       }
     }
diff --git a/packages/core/src/rules/cpp.ts b/packages/core/src/rules/cpp.ts
index 3e42eae..be4f18b 100644
--- a/packages/core/src/rules/cpp.ts
+++ b/packages/core/src/rules/cpp.ts
@@ -25,13 +25,16 @@
   },
   {
     kind: 'operator',
-    // pattern: /[+\-*/%&|^!~=<>?:]+|::|\(\)?|\[\]?|\{\}?/g,
     pattern: /[+\-*/%&~|^!=<>?:]+/g,
   },
   {
     kind: 'class',
     pattern: /\b[A-Z_][\w_]*\b/g,
   },
+  {
+    kind: 'function',
+    pattern: /[a-zA-Z_][\w_]*(?=\s*\()/g,
+  },
   {
     kind: 'symbol',
     pattern: /[a-zA-Z_]\w*/g,
diff --git a/packages/core/tests/detect/c-cpp.test.ts b/packages/core/tests/detect/c-cpp.test.ts
new file mode 100644
index 0000000..e1eb58c
--- /dev/null
+++ b/packages/core/tests/detect/c-cpp.test.ts
@@ -0,0 +1,75 @@
+import { describe, it, assert } from 'vitest';
+import { detectLanguage } from '../../src';
+
+// c language detection
+describe('c/cpp language detection', () => {
+  it('case 1', () => {
+    const code = `
+      #include <stdio.h>
+      int main() {
+        printf("hello world");
+        return 0;
+      }
+    `;
+    const lang = detectLanguage(code);
+    assert.include(['c', 'cpp'], lang);
+  });
+
+  it('case 2', () => {
+    const code = `
+      int main() {
+        printf("hello world");
+      }
+    `;
+    const lang = detectLanguage(code);
+    assert.include(['c', 'cpp'], lang);
+  });
+
+  it('case 3', () => {
+    const code = `
+      int foo() {
+        int n;
+        scanf("%d", &n);
+      }
+    `;
+    const lang = detectLanguage(code);
+    assert.include(['c', 'cpp'], lang);
+  });
+
+  it('case 4', () => {
+    const code = `
+      #include <iostream>
+
+      template <typename T>
+      T add(T a, T b) {
+        return a + b;
+      }
+
+      int main() {
+        std::cout << add(10, 20);
+        return 0;
+      }
+    `;
+    const lang = detectLanguage(code);
+    assert.equal('cpp', lang);
+  });
+
+  it('case 5', () => {
+    const code = `
+      #include <iostream>
+
+      class Base { virtual void dummy() {} };
+      class Derived: public Base { int a; };
+
+      int main () {
+        Base * b = new Base;
+        Derived * d = dynamic_cast<Derived*>(b);
+        if (d==nullptr) std::cout << "null";
+        else std::cout << "not null";
+        return 0;
+      }
+    `;
+    const lang = detectLanguage(code);
+    assert.equal('cpp', lang);
+  });
+});
diff --git a/packages/core/tests/tokenizer/c-cpp.test.ts b/packages/core/tests/tokenizer/c-cpp.test.ts
new file mode 100644
index 0000000..41c9af5
--- /dev/null
+++ b/packages/core/tests/tokenizer/c-cpp.test.ts
@@ -0,0 +1,127 @@
+import { describe, it, expect } from 'vitest';
+import { tokenize } from '../../src/tokenizer';
+import cppRules from '../../src/rules/cpp';
+
+describe('c/cpp tokenizer', () => {
+  it('can tokenize comment', () => {
+    const tokens = tokenize(
+      `
+      // this is comment
+      const str = "// not comment";
+      /*
+      this is multiline comment
+      */
+      `,
+      cppRules,
+    );
+    const commentTokens = tokens.filter((token) => token.kind === 'comment');
+    expect(commentTokens.length).toBe(2);
+    expect(
+      commentTokens.find((token) => token.value.includes('this is comment')),
+    ).not.toBeFalsy();
+    expect(
+      commentTokens.find((token) =>
+        token.value.includes('this is multiline comment'),
+      ),
+    ).not.toBeFalsy();
+  });
+
+  it('can tokenize keyword', () => {
+    const tokens = tokenize(
+      `
+      int main() {
+        int num = 10;
+        return 0;
+      }
+      `,
+      cppRules,
+    );
+    const keywords = ['int', 'return'];
+    keywords.forEach((keyword) => {
+      expect(
+        tokens.find(
+          (token) => token.value === keyword && token.kind === 'keyword',
+        ),
+      ).not.toBeFalsy();
+    });
+  });
+
+  it('can tokenize number', () => {
+    const tokens = tokenize(
+      `
+      int num = 123;
+      `,
+      cppRules,
+    );
+    const numberTokens = tokens.filter((token) => token.kind === 'number');
+    expect(numberTokens.find((token) => token.value === '123')).not.toBeFalsy();
+  });
+
+  it('can tokenize operator', () => {
+    const tokens = tokenize(
+      `
+      int num = 1 + 2 - 3 * 4 / 5 % 6;
+      `,
+      cppRules,
+    );
+    const operators = ['=', '+', '-', '*', '/', '%'];
+    const operatorTokens = tokens.filter((token) => token.kind === 'operator');
+    operators.forEach((operator) => {
+      expect(
+        operatorTokens.find((token) => token.value === operator),
+      ).not.toBeFalsy();
+    });
+  });
+
+  it('can tokenize function', () => {
+    const tokens = tokenize(
+      `
+      int main() {
+        // main
+      }
+      `,
+      cppRules,
+    );
+    const functionNames = ['main'];
+    const functionTokens = tokens.filter((token) => token.kind === 'function');
+    expect(functionTokens.length).toBe(functionNames.length);
+    functionNames.forEach((name) => {
+      expect(
+        functionTokens.find((token) => token.value === name),
+      ).not.toBeFalsy();
+    });
+  });
+
+  it('can tokenize class', () => {
+    const tokens = tokenize(
+      `
+      class MyClass {
+      public:
+        void greet() {
+          std::cout << "Hello, World!";
+        }
+      };
+      `,
+      cppRules,
+    );
+    const classTokens = tokens.filter((token) => token.kind === 'class');
+    expect(classTokens.length).toBe(1);
+    expect(classTokens[0].value).toBe('MyClass');
+  });
+
+  it('can tokenize string', () => {
+    const tokens = tokenize(
+      `
+      #include <iostream>
+      int main() {
+        std::cout << "Hello, World!";
+        return 0;
+      }
+      `,
+      cppRules,
+    );
+    const stringTokens = tokens.filter((token) => token.kind === 'string');
+    expect(stringTokens.length).toBe(1);
+    expect(stringTokens[0].value).toBe('"Hello, World!"');
+  });
+});