Promote ONNX model to top namespace #31

Merged
merged 1 commit on Oct 24, 2024
7 changes: 7 additions & 0 deletions README.md
@@ -88,6 +88,12 @@ export KERAS_BACKEND=jax
# Use useful-moonshine[jax-cuda] for jax on GPU
```

To run with the ONNX runtime, which is supported on a wide range of platforms, run the following:

```shell
uv pip install useful-moonshine[onnx]@git+https://github.com/usefulsensors/moonshine.git
```

### 3. Try it out

You can test Moonshine by transcribing the provided example audio file with the `.transcribe` function:
@@ -100,6 +106,7 @@ python
```

The first argument is a path to an audio file and the second is the name of a Moonshine model. `moonshine/tiny` and `moonshine/base` are the currently available models.
Use the `moonshine.transcribe_with_onnx` function to run inference with the ONNX runtime. Its parameters are the same as those of `moonshine.transcribe`.

## TODO
* [ ] Live transcription demo
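A minimal use of the new README instructions might look like the following sketch (the audio path is a placeholder; substitute any speech recording):

```python
import moonshine

# "example.wav" is a placeholder path; replace it with your own audio file.
text = moonshine.transcribe_with_onnx("example.wav", "moonshine/tiny")
print(text)
```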
9 changes: 8 additions & 1 deletion moonshine/__init__.py
@@ -4,4 +4,11 @@
ASSETS_DIR = Path(__file__).parents[0] / "assets"

from .model import load_model, Moonshine
from .transcribe import transcribe, benchmark, load_tokenizer, load_audio
from .transcribe import (
    transcribe,
    benchmark,
    load_tokenizer,
    load_audio,
    transcribe_with_onnx,
)
from .onnx_model import MoonshineOnnxModel
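With these re-exports in place, the ONNX entry points become importable from the package root:

```python
from moonshine import MoonshineOnnxModel, transcribe_with_onnx
```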
70 changes: 70 additions & 0 deletions moonshine/onnx_model.py
@@ -0,0 +1,70 @@
def _get_onnx_weights(model_name):
    from huggingface_hub import hf_hub_download

    repo = "UsefulSensors/moonshine"

    return (
        hf_hub_download(repo, f"{x}.onnx", subfolder=f"onnx/{model_name}")
        for x in ("preprocess", "encode", "uncached_decode", "cached_decode")
    )


class MoonshineOnnxModel(object):
    def __init__(self, models_dir=None, model_name=None):
        import onnxruntime

        if models_dir is None:
            assert (
                model_name is not None
            ), "model_name should be specified if models_dir is not"
            preprocess, encode, uncached_decode, cached_decode = (
                self._load_weights_from_hf_hub(model_name)
            )
        else:
            preprocess, encode, uncached_decode, cached_decode = [
                f"{models_dir}/{x}.onnx"
                for x in ["preprocess", "encode", "uncached_decode", "cached_decode"]
            ]
        self.preprocess = onnxruntime.InferenceSession(preprocess)
        self.encode = onnxruntime.InferenceSession(encode)
        self.uncached_decode = onnxruntime.InferenceSession(uncached_decode)
        self.cached_decode = onnxruntime.InferenceSession(cached_decode)

    def _load_weights_from_hf_hub(self, model_name):
        model_name = model_name.split("/")[-1]
        return _get_onnx_weights(model_name)

    def generate(self, audio, max_len=None):
        """audio has to be a numpy array of shape [1, num_audio_samples]"""
        if max_len is None:
            # max 6 tokens per second of audio
            max_len = int((audio.shape[-1] / 16_000) * 6)
        preprocessed = self.preprocess.run([], dict(args_0=audio))[0]
        seq_len = [preprocessed.shape[-2]]

        context = self.encode.run([], dict(args_0=preprocessed, args_1=seq_len))[0]
        inputs = [[1]]
        seq_len = [1]

        # Token 1 is the start-of-sequence marker; token 2 ends the sequence.
        tokens = [1]
        # The first step has no key/value cache yet, so use the uncached decoder.
        logits, *cache = self.uncached_decode.run(
            [], dict(args_0=inputs, args_1=context, args_2=seq_len)
        )
        for i in range(max_len):
            next_token = logits.squeeze().argmax()
            tokens.append(next_token)
            if next_token == 2:
                break

            seq_len[0] += 1
            inputs = [[next_token]]
            # Later steps feed the cached key/value tensors back in as extra args.
            logits, *cache = self.cached_decode.run(
                [],
                dict(
                    args_0=inputs,
                    args_1=context,
                    args_2=seq_len,
                    **{f"args_{i+3}": x for i, x in enumerate(cache)},
                ),
            )
        return [tokens]
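As a quick sanity check, the class can also be driven directly; a minimal sketch, assuming `onnxruntime`, `huggingface_hub`, and NumPy are installed and that the model expects float32 samples at 16 kHz:

```python
import numpy as np

from moonshine import MoonshineOnnxModel, load_tokenizer

model = MoonshineOnnxModel(model_name="moonshine/tiny")
# One second of silence with the documented [1, num_audio_samples] shape.
audio = np.zeros((1, 16_000), dtype=np.float32)
tokens = model.generate(audio)
print(load_tokenizer().decode_batch(tokens))
```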
50 changes: 0 additions & 50 deletions moonshine/tools/onnx_model.py

This file was deleted.

15 changes: 15 additions & 0 deletions moonshine/transcribe.py
@@ -40,6 +40,21 @@ def transcribe(audio, model="moonshine/base"):
    return load_tokenizer().decode_batch(tokens)


def transcribe_with_onnx(audio, model="moonshine/base"):
    from .onnx_model import MoonshineOnnxModel

    if isinstance(model, str):
        model = MoonshineOnnxModel(model_name=model)
    assert isinstance(
        model, MoonshineOnnxModel
    ), f"Expected a MoonshineOnnxModel model or a model name, not a {type(model)}"
    audio = load_audio(audio, return_numpy=True)
    assert_audio_size(audio)

    tokens = model.generate(audio)
    return load_tokenizer().decode_batch(tokens)


def load_tokenizer():
    tokenizer_file = ASSETS_DIR / "tokenizer.json"
    tokenizer = tokenizers.Tokenizer.from_file(str(tokenizer_file))
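Since `transcribe_with_onnx` also accepts a preloaded model, repeated calls can reuse one set of ONNX inference sessions instead of rebuilding them per file; a sketch with placeholder paths:

```python
from moonshine import MoonshineOnnxModel, transcribe_with_onnx

model = MoonshineOnnxModel(model_name="moonshine/base")
for path in ["clip_01.wav", "clip_02.wav"]:  # placeholder file names
    print(transcribe_with_onnx(path, model))
```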
1 change: 1 addition & 0 deletions setup.py
@@ -32,6 +32,7 @@ def read_version(fname="moonshine/version.py"):
"tensorflow": ["tensorflow==2.17.0"],
"jax": ["jax==0.4.34", "keras==3.6.0"],
"jax-cuda": ["jax[cuda12]", "keras==3.6.0"],
"onnx": ["onnxruntime>=1.19.2"],
},
include_package_data=True,
)
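With the new `onnx` entry in `extras_require`, the ONNX dependency can be pulled in at install time, for example from a local checkout:

```shell
pip install .[onnx]
```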