diff --git a/scripts/tokenizer-stats.js b/scripts/tokenizer-stats.js
old mode 100644
new mode 100755
index 026cd8a8..dd0a762d
--- a/scripts/tokenizer-stats.js
+++ b/scripts/tokenizer-stats.js
@@ -1,13 +1,75 @@
+#!/usr/bin/env node
+
 const fs = require('fs');
 
+const CLASS_EXAMPLES = true;
 const LONG_ID = 200;
 
+const classes = {
+  'EMPTY': /^$/,
+  'PUNCT': /^[^\p{L}0-9]+$/u,
+  'NUM': /^[0-9]+$/,
+  'ALPHA': /^[\p{L}']+$/u,
+  'SPECE_ALPHA': /^ [\p{L}']+$/u,
+  'SPECE_NUM': /^ [0-9]+$/,
+  'BYTE_FALLBACK': /^\<0x[0-9A-F][0-9A-F]\>$/,
+  'PUNCT_ALPHA': /^[^\p{L}'0-9 ][\p{L}]+$/u,
+  'BAD_UTF': /�/,
+}
+
+
+function charMap() {
+  const res = {}
+  let k = 0x100
+  for (let byte = 0; byte <= 255; byte++) {
+    const c = String.fromCharCode(byte)
+    if (c.match(/[\!-\~\u00A1-\u00AC\u00AE-\u00FF]/)) {
+      res[c] = byte
+    } else {
+      res[String.fromCharCode(k)] = byte
+      k += 1
+    }
+  }
+  return res
+}
+
+const char_map = charMap()
+
+function tokenNameToBytes(tok_name) {
+  const bytes = []
+  for (const c of tok_name) {
+    const code = char_map[c]
+    if (code === undefined) {
+      throw new Error(`missing char: ${c}`)
+    }
+    bytes.push(code)
+  }
+  return bytes
+}
+
+function tokenNameToString(tok_name) {
+  return Buffer.from(tokenNameToBytes(tok_name)).toString('utf8')
+}
+
+function padNum(n) {
+  return n.toString().padStart(6, ' ')
+}
+
+
 function stats(fn) {
   console.log(fn)
   const tokenizer = JSON.parse(fs.readFileSync(fn, 'utf8'))
 
+  const addedTokens = {}
+  for (const added of tokenizer.added_tokens) {
+    addedTokens[added.content] = true
+  }
+
   const numbylen = {}
   let max_id = 0
+  const tokensByClass = {}
+
+  const isFallback = tokenizer.model.vocab.hasOwnProperty("▁▁▁");
 
   for (const [str, id] of Object.entries(tokenizer.model.vocab)) {
     const len = Math.floor(str.length / 10)
@@ -19,14 +81,58 @@ function stats(fn) {
     if (str.length > LONG_ID) {
       console.log("Long token: ", id, JSON.stringify(str))
     }
+    const t = isFallback ? str.replace(/▁/g, " ") : tokenNameToString(str)
+    const tclasses = []
+    if (addedTokens.hasOwnProperty(str)) {
+      tclasses.push('ADDED')
+    } else {
+      const t2 = t.replace(/\n/g, "\t")
+      for (const [n, rx] of Object.entries(classes)) {
+        if (t2.match(rx)) {
+          tclasses.push(n)
+        }
+      }
+    }
+    if (tclasses.length == 0) {
+      console.log("No class: ", id, JSON.stringify(t))
+      tclasses.push('UNKNOWN')
+    } else if (tclasses.length > 1 && !tclasses.includes('BAD_UTF') && !/^[ ']+$/.test(t)) {
+      console.log("Multiple classes: ", id, JSON.stringify(t), tclasses)
+    }
+
+    for (const c of tclasses) {
+      if (!tokensByClass[c]) {
+        tokensByClass[c] = []
+      }
+      tokensByClass[c].push(t)
+    }
   }
 
   console.log("Max ID: ", max_id)
   console.log("Length distribution:")
   for (const [len0, num] of Object.entries(numbylen)) {
     const len = parseInt(len0)
-    console.log(`   ${len * 10}-${(len + 1) * 10 - 1}`, num)
+    console.log(padNum(num), `${len * 10}-${(len + 1) * 10 - 1}`)
+  }
+  console.log("Classes:")
+  for (const [c, elts] of Object.entries(tokensByClass)) {
+    console.log(padNum(elts.length), c, "   ", classes[c])
+    if (CLASS_EXAMPLES) {
+      console.log("     ", JSON.stringify(permute(elts).slice(0, 10)))
+    }
+  }
+}
+
+function permute(arr) {
+  // randomly permute elts of array
+  const res = arr.slice()
+  for (let i = 0; i < arr.length; i++) {
+    const j = Math.floor(Math.random() * arr.length)
+    const tmp = res[i]
+    res[i] = res[j]
+    res[j] = tmp
   }
+  return res
 }
 
 for (let i = 2; i < process.argv.length; i++) {