diff --git a/examples/tokenizer-playground/package-lock.json b/examples/tokenizer-playground/package-lock.json index e3a04b4de..67f9bcb84 100644 --- a/examples/tokenizer-playground/package-lock.json +++ b/examples/tokenizer-playground/package-lock.json @@ -8,7 +8,7 @@ "name": "tokenizer-playground", "version": "0.0.0", "dependencies": { - "@xenova/transformers": "^2.15.1", + "@xenova/transformers": "^2.17.1", "react": "^18.2.0", "react-dom": "^18.2.0" }, @@ -811,9 +811,9 @@ } }, "node_modules/@huggingface/jinja": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.3.tgz", - "integrity": "sha512-9KsiorsdIK8+7VmlamAT7Uh90zxAhC/SeKaKc80v58JhtPYuwaJpmR/ST7XAUxrHAFqHTCoTH5aJnJDwSL6xIQ==", + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz", + "integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==", "engines": { "node": ">=18" } @@ -1086,11 +1086,11 @@ } }, "node_modules/@xenova/transformers": { - "version": "2.15.1", - "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.15.1.tgz", - "integrity": "sha512-HX3kUZbr9v90PS/D2SyffGiFYQ6wQMbzwC1uLuCOA6nRSFOdr0TDnOTxfjS0RB6Phn6ThCTE1vX4n+NUsuobQg==", + "version": "2.17.1", + "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.1.tgz", + "integrity": "sha512-zo702tQAFZXhzeD2GCYUNUqeqkoueOdiSbQWa4s0q7ZE4z8WBIwIsMMPGobpgdqjQ2u0Qulo08wuqVEUrBXjkQ==", "dependencies": { - "@huggingface/jinja": "^0.1.3", + "@huggingface/jinja": "^0.2.2", "onnxruntime-web": "1.14.0", "sharp": "^0.32.0" }, diff --git a/examples/tokenizer-playground/package.json b/examples/tokenizer-playground/package.json index 0f7500879..0d77ce8db 100644 --- a/examples/tokenizer-playground/package.json +++ b/examples/tokenizer-playground/package.json @@ -10,7 +10,7 @@ "preview": "vite preview" }, "dependencies": { - "@xenova/transformers": "^2.15.1", + "@xenova/transformers": "^2.17.1", "react": "^18.2.0", "react-dom": "^18.2.0" }, diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx index 98173f8fb..1e1a286c3 100644 --- a/examples/tokenizer-playground/src/App.jsx +++ b/examples/tokenizer-playground/src/App.jsx @@ -1,7 +1,24 @@ import { useCallback, useEffect, useRef, useState } from 'react' -import './App.css' import { Token } from './components/Token' +import './App.css' +// Define list of tokenizers and their corresponding human-readable names +const TOKENIZER_OPTIONS = Object.freeze({ + 'Xenova/gpt-4': 'gpt-4 / gpt-3.5-turbo / text-embedding-ada-002', + 'Xenova/text-davinci-003': 'text-davinci-003 / text-davinci-002', + 'Xenova/gpt-3': 'gpt-3', + 'Xenova/grok-1-tokenizer': 'Grok-1', + 'Xenova/claude-tokenizer': 'Claude', + 'Xenova/mistral-tokenizer-v3': 'Mistral v3', + 'Xenova/mistral-tokenizer-v1': 'Mistral v1', + 'Xenova/gemma-tokenizer': 'Gemma', + 'Xenova/llama-3-tokenizer': 'Llama 3', + 'Xenova/llama-tokenizer': 'LLaMA / Llama 2', + 'Xenova/c4ai-command-r-v01-tokenizer': 'Cohere Command-R', + 'Xenova/t5-small': 'T5', + 'Xenova/bert-base-cased': 'bert-base-cased', + '': 'Custom', +}) function App() { // Allow user to set tokenizer and text via URL query parameters @@ -14,6 +31,7 @@ function App() { const [margins, setMargins] = useState([]) const [outputOption, setOutputOption] = useState('text'); const [tokenizer, setTokenizer] = useState(tokenizerParam ?? 'Xenova/gpt-4'); + const [customTokenizer, setCustomTokenizer] = useState(''); const textareaRef = useRef(null); const outputRef = useRef(null); @@ -44,6 +62,13 @@ function App() { return () => worker.current.removeEventListener('message', onMessageReceived); }, []); + const resetOutput = useCallback(() => { + setOutputOption('text'); + setTokenIds([]); + setDecodedTokens([]); + setMargins([]); + }, []); + const onInputChange = useCallback((e) => { const model_id = tokenizer; const text = e.target.value; @@ -64,8 +89,10 @@ function App() { const onTokenizerChange = useCallback((e) => { const model_id = e.target.value; setTokenizer(model_id); + if (!model_id) return; worker.current.postMessage({ model_id, text: textareaRef.current.value }); }, []); + return (
@@ -74,21 +101,28 @@ function App() {

Experiment with different tokenizers (running locally in your browser).

-
- { + resetOutput(); + setCustomTokenizer(''); + onTokenizerChange(e); + }} className="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2"> + {Object.entries(TOKENIZER_OPTIONS).map(([value, label]) => ( + + ))} + {(!(tokenizer in TOKENIZER_OPTIONS) || customTokenizer || tokenizer === '') && ( + { + setCustomTokenizer(e.target.value); + onTokenizerChange(e); + }} + className="bg-white border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full py-1 px-2 mt-1" + /> + )}