Skip to content

Commit

Permalink
Update tokenizer playground (#717)
Browse files Browse the repository at this point in the history
* [tokenizer playground] Add Mistral v3 tokenizer

* Add support for custom tokenizers

* Add llama 3 tokenizer
  • Loading branch information
xenova authored May 10, 2024
1 parent 880cd3e commit bd31552
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 23 deletions.
16 changes: 8 additions & 8 deletions examples/tokenizer-playground/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/tokenizer-playground/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"preview": "vite preview"
},
"dependencies": {
"@xenova/transformers": "^2.15.1",
"@xenova/transformers": "^2.17.1",
"react": "^18.2.0",
"react-dom": "^18.2.0"
},
Expand Down
62 changes: 48 additions & 14 deletions examples/tokenizer-playground/src/App.jsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,24 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import './App.css'
import { Token } from './components/Token'
import './App.css'

/**
 * Tokenizers offered in the dropdown: maps a model id to its human-readable
 * label. Entries are rendered as <option>s in insertion order. The
 * empty-string key is the "Custom" entry.
 *
 * The map is built on a null-prototype object because the component checks
 * membership with `tokenizer in TOKENIZER_OPTIONS`, and `tokenizer` can come
 * from a URL query parameter. On a plain object literal, `in` would also match
 * inherited keys such as "constructor" or "toString", so those values would be
 * mistaken for known tokenizers. With no prototype, only the ids listed here
 * are members.
 */
const TOKENIZER_OPTIONS = Object.freeze(Object.assign(Object.create(null), {
  'Xenova/gpt-4': 'gpt-4 / gpt-3.5-turbo / text-embedding-ada-002',
  'Xenova/text-davinci-003': 'text-davinci-003 / text-davinci-002',
  'Xenova/gpt-3': 'gpt-3',
  'Xenova/grok-1-tokenizer': 'Grok-1',
  'Xenova/claude-tokenizer': 'Claude',
  'Xenova/mistral-tokenizer-v3': 'Mistral v3',
  'Xenova/mistral-tokenizer-v1': 'Mistral v1',
  'Xenova/gemma-tokenizer': 'Gemma',
  'Xenova/llama-3-tokenizer': 'Llama 3',
  'Xenova/llama-tokenizer': 'LLaMA / Llama 2',
  'Xenova/c4ai-command-r-v01-tokenizer': 'Cohere Command-R',
  'Xenova/t5-small': 'T5',
  'Xenova/bert-base-cased': 'bert-base-cased',
  '': 'Custom',
}));

function App() {
// Allow user to set tokenizer and text via URL query parameters
Expand All @@ -14,6 +31,7 @@ function App() {
const [margins, setMargins] = useState([])
const [outputOption, setOutputOption] = useState('text');
const [tokenizer, setTokenizer] = useState(tokenizerParam ?? 'Xenova/gpt-4');
const [customTokenizer, setCustomTokenizer] = useState('');

const textareaRef = useRef(null);
const outputRef = useRef(null);
Expand Down Expand Up @@ -44,6 +62,13 @@ function App() {
return () => worker.current.removeEventListener('message', onMessageReceived);
}, []);

// Restore the output panel to its initial state: switch the output view back
// to 'text' and clear the token ids, decoded tokens and margins.
// The dependency list is empty, so the callback is created once.
const resetOutput = useCallback(
  () => {
    setOutputOption('text');
    setTokenIds([]);
    setDecodedTokens([]);
    setMargins([]);
  },
  [],
);

const onInputChange = useCallback((e) => {
const model_id = tokenizer;
const text = e.target.value;
Expand All @@ -64,8 +89,10 @@ function App() {
// Change handler shared by the tokenizer <select> and the custom-tokenizer
// text input. It records the chosen model id in state and, when the id is
// non-empty, posts it to the worker together with the current textarea text.
const onTokenizerChange = useCallback((event) => {
  const { value: selectedModelId } = event.target;
  setTokenizer(selectedModelId);

  // An empty id means no tokenizer has been chosen yet, so nothing is sent
  // to the worker.
  if (selectedModelId) {
    const currentText = textareaRef.current.value;
    worker.current.postMessage({ model_id: selectedModelId, text: currentText });
  }
}, []);

return (
<div className='w-full max-w-[720px] flex flex-col gap-4 items-center'>

Expand All @@ -74,21 +101,28 @@ function App() {
<h2 className='text-lg font-normal'>Experiment with different tokenizers (running <a className="text-gray-900 underline" href="https://github.com/xenova/transformers.js">locally</a> in your browser).</h2>
</div>


<div>
<select value={tokenizer} onChange={onTokenizerChange} className="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2">
<option value="Xenova/gpt-4">gpt-4 / gpt-3.5-turbo / text-embedding-ada-002</option>
<option value="Xenova/text-davinci-003">text-davinci-003 / text-davinci-002</option>
<option value="Xenova/gpt-3">gpt-3</option>
<option value="Xenova/grok-1-tokenizer">Grok-1</option>
<option value="Xenova/claude-tokenizer">Claude</option>
<option value="Xenova/mistral-tokenizer">Mistral</option>
<option value="Xenova/gemma-tokenizer">Gemma</option>
<option value="Xenova/llama-tokenizer">LLaMA / Llama 2</option>
<option value="Xenova/c4ai-command-r-v01-tokenizer">Cohere Command-R</option>
<option value="Xenova/t5-small">T5</option>
<option value="Xenova/bert-base-cased">bert-base-cased</option>
<select value={(tokenizer in TOKENIZER_OPTIONS && !customTokenizer) ? tokenizer : ''} onChange={(e) => {
resetOutput();
setCustomTokenizer('');
onTokenizerChange(e);
}} className="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2">
{Object.entries(TOKENIZER_OPTIONS).map(([value, label]) => (
<option key={value} value={value}>{label}</option>
))}
</select>
{(!(tokenizer in TOKENIZER_OPTIONS) || customTokenizer || tokenizer === '') && (
<input
type="text"
placeholder="Custom tokenizer"
defaultValue={customTokenizer || tokenizer}
onChange={(e) => {
setCustomTokenizer(e.target.value);
onTokenizerChange(e);
}}
className="bg-white border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full py-1 px-2 mt-1"
/>
)}
</div>


Expand Down

0 comments on commit bd31552

Please sign in to comment.