Skip to content

Commit

Permalink
fix: add c/cpp detect rule and tests for detector/tokenizer (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
blurfx authored Nov 25, 2023
1 parent 62ed238 commit 0d30d04
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 7 deletions.
2 changes: 2 additions & 0 deletions packages/core/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ export enum Language {
TypeScript = 'typescript',
JavaScript = 'javascript',
Golang = 'golang',
C = 'c',
Cpp = 'cpp',
}
26 changes: 20 additions & 6 deletions packages/core/src/detect.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
import { Language } from './constants.ts';

// Scoring table for language detection: for every Language, a list of
// [pattern, weight] pairs. Every pattern carries the `g` flag so that all
// occurrences in the input can be enumerated.
// NOTE(review): presumably consumed by detectLanguage, which adds
// `matches.length * weight` per pattern to the language's score — confirm
// against the full file.
// NOTE(review): this block is rendered from a diff without +/- markers, so
// several entries appear twice (old form immediately followed by the new form,
// e.g. `javascript: [` then `[Language.JavaScript]: [`) — confirm against the
// actual file before relying on it.
const keywordWeights: Record<Language, Array<[RegExp, number]>> = {
javascript: [
[Language.JavaScript]: [
// A `console.` member access is weighted 100 per match.
[/console\./g, 100],
[
/\b(console|await|async|function|export|import|this|class|for|let|const|map|join|require)\b/g,
// Same keyword list as the line above, without the bare `console` word.
/\b(await|async|function|export|import|this|class|for|let|const|map|join|require)\b/g,
10,
],
],
typescript: [
[Language.TypeScript]: [
// Same `console.` rule and weight as the JavaScript entry.
[/console\./g, 100],
[
/\b(console|await|async|function|export|import|this|class|for|let|infer|const|map|join|require|type|implements|interface|declare|namespace|unknown|bigint|any|void|number|boolean|string|object|never|enum|unique symbol|symbol)\b/g,
// The JavaScript keyword list plus type-level keywords (type, interface, enum, ...).
/\b(await|async|function|export|import|this|class|for|let|infer|const|map|join|require|type|implements|interface|declare|namespace|unknown|bigint|any|void|number|boolean|string|object|never|enum|unique symbol|symbol)\b/g,
10,
],
],
golang: [[/\b(defer|go|chan|fmt|select|package)\b/g, 100]],
[Language.Golang]: [[/\b(defer|go|chan|fmt|select|package)\b/g, 100]],
[Language.C]: [
// `#include` directives and `printf(` / `scanf(` calls: weight 100 each.
[/#include\b|printf\s*\(|scanf\s*\(/g, 100],
// C keywords: weight 10 each.
[/\b(union|typedef|struct|register|volatile|goto|sizeof)\b/g, 10],
],
[Language.Cpp]: [
// Identical to the weight-100 C pattern, so plain C input scores the same
// for both entries on this rule.
[/#include\b|printf\s*\(|scanf\s*\(/g, 100],
[
// Identifiers and keywords that only appear in the C++ entry: weight 100 each.
/\b(cin|cout|template|dynamic_cast|static_cast|reinterpret_cast|const_cast|typeid|nullptr|constexpr|decltype|static_assert|noexcept|thread_local|alignas|alignof)\b/g,
100,
],
// The C keyword list plus `virtual`: weight 10 each.
[/\b(union|typedef|struct|register|virtual|volatile|goto|sizeof)\b/g, 10],
],
};

export const detectLanguage = (code: string): string => {
Expand All @@ -22,7 +36,7 @@ export const detectLanguage = (code: string): string => {
scores[lang] = 0;
for (const [pattern, weight] of patterns) {
const matches = [...code.matchAll(pattern)];
if (matches) {
if (matches.length > 0) {
scores[lang] += matches.length * weight;
}
}
Expand Down
5 changes: 4 additions & 1 deletion packages/core/src/rules/cpp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@ const cppRules: ParseRule[] = [
},
{
kind: 'operator',
// pattern: /[+\-*/%&|^!~=<>?:]+|::|\(\)?|\[\]?|\{\}?/g,
pattern: /[+\-*/%&~|^!=<>?:]+/g,
},
{
kind: 'class',
pattern: /\b[A-Z_][\w_]*\b/g,
},
{
kind: 'function',
pattern: /[a-zA-Z_][\w_]*(?=\s*\()/g,
},
{
kind: 'symbol',
pattern: /[a-zA-Z_]\w*/g,
Expand Down
75 changes: 75 additions & 0 deletions packages/core/tests/detect/c-cpp.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { describe, it, assert } from 'vitest';
import { detectLanguage } from '../../src';

// C / C++ language detection.
//
// Cases 1-3 use snippets without any C++-only construct and accept either 'c'
// or 'cpp' as the detected language. Cases 4-5 use C++-only constructs
// (templates, `std::cout`, `dynamic_cast`, `nullptr`) and require 'cpp'.
describe('c/cpp language detection', () => {
  // `#include` directive plus a `printf(` call.
  it('case 1', () => {
    const code = `
#include <stdio.h>
int main() {
printf("hello world");
return 0;
}
`;
    const lang = detectLanguage(code);
    assert.include(['c', 'cpp'], lang);
  });

  // `printf(` call only, no `#include`.
  it('case 2', () => {
    const code = `
int main() {
printf("hello world");
}
`;
    const lang = detectLanguage(code);
    assert.include(['c', 'cpp'], lang);
  });

  // `scanf(` call only.
  it('case 3', () => {
    const code = `
int foo() {
int n;
scanf("%d", &n);
}
`;
    const lang = detectLanguage(code);
    assert.include(['c', 'cpp'], lang);
  });

  // Function template and `std::cout`.
  it('case 4', () => {
    const code = `
#include <iostream>
template <typename T>
T add(T a, T b) {
return a + b;
}
int main() {
std::cout << add<int>(10, 20);
return 0;
}
`;
    const lang = detectLanguage(code);
    // chai's signature is assert.equal(actual, expected): the detected value
    // goes first so a failure reads "expected <detected> to equal 'cpp'".
    assert.equal(lang, 'cpp');
  });

  // Classes with a virtual method, `dynamic_cast`, `nullptr` and `std::cout`.
  it('case 5', () => {
    const code = `
#include <iostream>
class Base { virtual void dummy() {} };
class Derived: public Base { int a; };
int main () {
Base * b = new Base;
Derived * d = dynamic_cast<Derived*>(b);
if (d==nullptr) std::cout << "null";
else std::cout << "not null";
return 0;
}
`;
    const lang = detectLanguage(code);
    // Actual value first, expected value second (see case 4).
    assert.equal(lang, 'cpp');
  });
});
127 changes: 127 additions & 0 deletions packages/core/tests/tokenizer/c-cpp.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import { describe, it, expect } from 'vitest';
import { tokenize } from '../../src/tokenizer';
import cppRules from '../../src/rules/cpp';

/**
 * Tokenizer tests for the C/C++ rule set. Every case runs a small snippet
 * through `tokenize` with `cppRules` and inspects the tokens of a single kind.
 */
describe('c/cpp tokenizer', () => {
  it('can tokenize comment', () => {
    const source = `
// this is comment
const str = "// not comment";
/*
this is multiline comment
*/
`;
    const comments = tokenize(source, cppRules).filter(
      ({ kind }) => kind === 'comment',
    );

    // Exactly two comments: the `//` inside the string literal does not count.
    expect(comments).toHaveLength(2);
    for (const fragment of ['this is comment', 'this is multiline comment']) {
      const hit = comments.find(({ value }) => value.includes(fragment));
      expect(hit).toBeTruthy();
    }
  });

  it('can tokenize keyword', () => {
    const source = `
int main() {
int num = 10;
return 0;
}
`;
    const allTokens = tokenize(source, cppRules);

    for (const word of ['int', 'return']) {
      const hit = allTokens.find(
        ({ kind, value }) => kind === 'keyword' && value === word,
      );
      expect(hit).toBeTruthy();
    }
  });

  it('can tokenize number', () => {
    const source = `
int num = 123;
`;
    const numbers = tokenize(source, cppRules).filter(
      ({ kind }) => kind === 'number',
    );

    const literal = numbers.find(({ value }) => value === '123');
    expect(literal).toBeTruthy();
  });

  it('can tokenize operator', () => {
    const source = `
int num = 1 + 2 - 3 * 4 / 5 % 6;
`;
    const operatorTokens = tokenize(source, cppRules).filter(
      ({ kind }) => kind === 'operator',
    );

    // Every arithmetic/assignment operator used in the snippet must show up.
    for (const symbol of ['=', '+', '-', '*', '/', '%']) {
      const hit = operatorTokens.find(({ value }) => value === symbol);
      expect(hit).toBeTruthy();
    }
  });

  it('can tokenize function', () => {
    const source = `
int main() {
// main
}
`;
    const expectedNames = ['main'];
    const functions = tokenize(source, cppRules).filter(
      ({ kind }) => kind === 'function',
    );

    // Only the definition counts; the `main` inside the comment must not.
    expect(functions).toHaveLength(expectedNames.length);
    for (const expected of expectedNames) {
      const hit = functions.find(({ value }) => value === expected);
      expect(hit).toBeTruthy();
    }
  });

  it('can tokenize class', () => {
    const source = `
class MyClass {
public:
void greet() {
std::cout << "Hello, World!";
}
};
`;
    const classNames = tokenize(source, cppRules)
      .filter(({ kind }) => kind === 'class')
      .map(({ value }) => value);

    // A single class token whose text is the class name.
    expect(classNames).toEqual(['MyClass']);
  });

  it('can tokenize string', () => {
    const source = `
#include <iostream>
int main() {
std::cout << "Hello, World!";
return 0;
}
`;
    const stringValues = tokenize(source, cppRules)
      .filter(({ kind }) => kind === 'string')
      .map(({ value }) => value);

    // A single string token, quotes included.
    expect(stringValues).toEqual(['"Hello, World!"']);
  });
});

0 comments on commit 0d30d04

Please sign in to comment.