diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 0000000..a2feb49 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,79 @@ +name: Bug Report +description: You found a bug. +labels: ["bug", "triage"] +body: + - type: dropdown + id: backend + attributes: + label: Backend impacted + description: Which backend is concerned with your bug report? + options: + - The PyTorch implementation + - The MLX implementation + - The Rust implementation + - Other / All + default: 0 + validations: + required: true + - type: dropdown + id: os + attributes: + label: Operating system + description: What is your operating system? + options: + - Linux + - Mac OS X + - Windows (unsupported) + default: 0 + validations: + required: true + - type: dropdown + id: hardware + attributes: + label: Hardware + description: What hardware are you using? + options: + - CPU + - GPU with CUDA + - Metal with MLX + default: 0 + validations: + required: true + - type: textarea + id: description + attributes: + label: Description + description: Provide a detailed description of your bug. + placeholder: + value: + validations: + required: true + - type: textarea + id: more_info + attributes: + label: Extra information + description: Please provide any other relevant information, such as log extracts, code etc. + placeholder: + value: + validations: + required: true + - type: textarea + id: env + attributes: + label: Environment + description: Please provide any other relevant information, such as log extracts, code etc. + placeholder: + value: | + Fill in the following information on your system. + - Operating system version: + + If the backend impacted is PyTorch: + - Python version: + - PyTorch version: + - CUDA version (run `python -c 'import torch; print(torch.version.cuda)'`): + - GPU model and memory: + + If the backend is MLX: + - Mac model: + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug_old.md similarity index 100% rename from .github/ISSUE_TEMPLATE/bug.md rename to .github/ISSUE_TEMPLATE/bug_old.md diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md deleted file mode 100644 index a074579..0000000 --- a/.github/ISSUE_TEMPLATE/question.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -name: "❓Questions/Help/Support" -about: If you have a question about the paper, code or algorithm, please ask here! -labels: question, triage ---- - -## ❓ Questions - - diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml new file mode 100644 index 0000000..d560591 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.yml @@ -0,0 +1,36 @@ +name: Question +description: You have a question about Moshi/Mimi, this codebase. +labels: ["question", "triage"] +body: + - type: checkboxes + id: terms + attributes: + label: Due diligence + description: Have you searched the existing issues / Google / asked ChatGPT? + options: + - label: I have done my due diligence in trying to find the answer myself. + required: true + + - type: dropdown + id: backend + attributes: + label: Topic + description: What is your question about? + options: + - The paper + - The PyTorch implementation + - The MLX implementation + - The Rust implementation + - Other / All + default: 0 + validations: + required: true + - type: textarea + id: question + attributes: + label: Question + description: What is your question? + placeholder: Your question. Please make sure this is directly related to our codebase. 
We will not provide support for installing PyTorch, CUDA, Rust etc. + value: + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..b66b039 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,9 @@ +## Checklist + +- [ ] Read CONTRIBUTING.md, and accept the CLA by including the provided snippet. We will not accept PR without this. +- [ ] Run pre-commit hook. +- [ ] If you changed Rust code, run `cargo check`, `cargo clippy`, `cargo test`. + +## PR Description + + diff --git a/.github/actions/moshi_build/action.yml b/.github/actions/moshi_build/action.yml index 2be3812..16bee44 100755 --- a/.github/actions/moshi_build/action.yml +++ b/.github/actions/moshi_build/action.yml @@ -22,6 +22,6 @@ runs: - name: Setup env shell: bash run: | - . env/bin/activate + source env/bin/activate || ( echo "FAILED" && ls && ls env && exit 1) pre-commit install pip install -e './moshi[dev]' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index af18af5..2621176 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,20 +4,47 @@ Moshi is the implementation of a research paper. Therefore, we do not plan on accepting many pull requests for new features. -We certainly welcome them for bug fixes. +However, we certainly welcome them for bug fixes. 1. Fork the repo and create your branch from `main`. -2. If you've changed APIs, update the documentation. +2. If you have changed APIs, update the documentation accordingly. 3. Ensure pre-commit hooks pass properly, in particular the linting and typing. -4. Accept the Contributor License Agreement (see after). +4. When changing the Rust code, run `cargo check`, `cargo clippy`, `cargo test`. +5. Accept the Contributor License Agreement (see after). -Note that in general we will not accept refactoring of the code. +Note that in general, we will not accept refactoring of the code. ## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a Contributtor License Agreement. -As this CLA is not ready yet, we will delay acceptance of your PR. +In order to accept your pull request, we need you to submit a Contributor License Agreement. + +If you agree with the full CLA provided in the next paragraph, copy the following statement in your PR, changing your Github Handle: + +> I, {your GitHub handle}, confirm that I have read and understood the terms of the CLA of Kyutai-labs, as outlined in the repository's CONTRIBUTING.md, and I agree to be bound by these terms. + +The full CLA is provided as follows: + +> I, {your GitHub handle}, hereby grant to Kyutai-labs a perpetual, worldwide, non-exclusive, royalty-free, +> irrevocable license to use, modify, distribute, and sublicense my Contributions. + +> I understand and accept that Contributions are limited to modifications, improvements, or changes +> to the project’s source code submitted via pull requests. I accept that Kyutai-labs has full discretion to +> review, accept, reject, or request changes to any Contributions I submit, and that submitting +> a pull request does not guarantee its inclusion in the project. + +> By submitting a Contribution, I grant Kyutai-labs a perpetual, worldwide license to use, modify, +> reproduce, distribute, and create derivative works based on my Contributions. +> I also agree to assign all patent rights for any inventions or improvements that arise from my Contributions, +> giving the Kyutai-labs full rights to file for and enforce patents. 
+> I understand that the Kyutai-labs may commercialize, relicense, or exploit the project and my Contributions without further notice or obligation to me. +> I confirm that my Contributions are original and that I have the legal right to grant this license. +> If my Contributions include third-party materials, I will ensure that I have the necessary permissions +> and will disclose this information. I accept that once my Contributions are integrated, they may be altered or removed at the Kyutai-labs’s discretion. + +> I acknowledge that I am making these Contributions voluntarily and will not receive any compensation. +> Furthermore, I understand that all Contributions, including mine, are provided on an "as-is" basis, with no warranties. +> By submitting a pull request, I agree to be bound by these terms. ## Issues diff --git a/README.md b/README.md index 8dfb398..623cad8 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,53 @@ -# Moshi: a speech-text fundation model for real time dialogue +# Moshi: a speech-text foundation model for real time dialogue ![precommit badge](https://github.com/kyutai-labs/moshi/workflows/precommit/badge.svg) ![rust ci badge](https://github.com/kyutai-labs/moshi/workflows/Rust%20CI/badge.svg) [Moshi][moshi] is a speech-text foundation model and **full-duplex** spoken dialogue framework. - It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at 12.5 Hz, and compresses - audio down to 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), + It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi processes 24 kHz audio, down to a 12.5 Hz representation + with a bandwidth of 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), yet performs better than existing, non-streaming, codec like - [SpeechTokenizer](https://github.com/ZhangXInFD/SpeechTokenizer) (50 Hz, 4 kbps), or [SemantiCodec](https://github.com/haoheliu/SemantiCodec-inference) (50 Hz, 1kbps). + [SpeechTokenizer](https://github.com/ZhangXInFD/SpeechTokenizer) (50 Hz, 4kbps), or [SemantiCodec](https://github.com/haoheliu/SemantiCodec-inference) (50 Hz, 1.3kbps). - Moshi models **two streams of audio**: one corresponds to Moshi, and one to the user. + Moshi models **two streams of audio**: one corresponds to Moshi, and the other one to the user. At inference, the stream from the user is taken from the audio input, and the one for Moshi is sampled from the model's output. Along these two audio streams, Moshi predicts text tokens corresponding to its own speech, its **inner monologue**, which greatly improves the quality of its generation. A small Depth Transformer models inter codebook dependencies for a given time step, -while a large, 7B parameter Transformer models the temporal dependencies. Moshi achieves a theoretical latency -of 160ms (80ms for the frame size of Mimi + 80ms of acoustic delay), with a practical overall latency as low as 200ms. +while a large, 7B parameter Temporal Transformer models the temporal dependencies. Moshi achieves a theoretical latency +of 160ms (80ms for the frame size of Mimi + 80ms of acoustic delay), with a practical overall latency as low as 200ms on an L4 GPU. + [Talk to Moshi](https://moshi.chat) now on our live demo. +
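For readers who want to sanity-check the figures quoted above (24 kHz audio, 12.5 Hz frame rate, 1.1 kbps, 80 ms frame plus 80 ms acoustic delay), here is a small back-of-the-envelope sketch in Python. It only restates the numbers from the paragraph above; nothing here is measured from the code.

```python
# Back-of-the-envelope check of the figures quoted in the README above.
SAMPLE_RATE_HZ = 24_000      # Mimi processes 24 kHz audio
FRAME_RATE_HZ = 12.5         # Mimi's output frame rate
BITRATE_BPS = 1_100          # 1.1 kbps, as quoted above

frame_size_samples = SAMPLE_RATE_HZ / FRAME_RATE_HZ   # 1920 samples per frame
frame_duration_ms = 1_000 / FRAME_RATE_HZ             # 80 ms, the streaming latency of Mimi
bits_per_frame = BITRATE_BPS / FRAME_RATE_HZ          # 88 bits of codec tokens per frame

acoustic_delay_ms = 80                                 # delay between Moshi's text and audio tokens
theoretical_latency_ms = frame_duration_ms + acoustic_delay_ms  # 160 ms

print(frame_size_samples, frame_duration_ms, bits_per_frame, theoretical_latency_ms)
```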
-<img src="moshi.png" alt="Schema representing the structure Moshi. Moshi models two streams of audio:
-    one corresponds to Moshi, and one to the user. At inference, the one from the user is taken from the audio input,
-    and the one for Moshi is sampled from. Along that, Moshi predicts text tokens corresponding to its own speech
-    for improved accuracy. A small depth transformer models inter codebook dependencies for a given step.">
+<img src="moshi.png" alt="Schema representing the structure of Moshi.">
Mimi builds on previous neural audio codecs such as [SoundStream](https://arxiv.org/abs/2107.03312) and [EnCodec](https://github.com/facebookresearch/encodec), adding a Transformer both in the encoder and decoder, and adapting the strides to match an overall frame rate of 12.5 Hz. This allows Mimi to get closer to the -average frame rate of text tokens (~3-4 Hz), and limit the number of auto-regressive steps in Moshi. +average frame rate of text tokens (~3-4 Hz), and limit the number of autoregressive steps in Moshi. Similarly to SpeechTokenizer, Mimi uses a distillation loss so that the first codebook tokens match -a self-supervised representation from [WavLM](https://arxiv.org/abs/2110.13900). Interestingly, while -Mimi is fully causal and streaming, it learns to match sufficiently well the non causal representation from WavLM, -without introducing any delays. Finally, and similary to [EBEN](https://arxiv.org/pdf/2210.14090), Mimi -uses **only an adversarial training loss**, along with feature matching, showing strong improvements in terms of subjective quality -despite its low bitrate. +a self-supervised representation from [WavLM](https://arxiv.org/abs/2110.13900), which allows modeling semantic and acoustic information with a single model. Interestingly, while Mimi is fully causal and streaming, it learns to match sufficiently well the non-causal +representation from WavLM, without introducing any delays. Finally, and similarly to [EBEN](https://arxiv.org/pdf/2210.14090), +Mimi uses **only an adversarial training loss**, along with feature matching, showing strong improvements in terms of +subjective quality despite its low bitrate.
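Mimi's training code is not released in this repository, so purely as an illustration of the distillation idea described above, a loss of this flavour might look like the sketch below. The function name, tensor shapes, and the choice of a cosine objective are assumptions made for illustration, not the actual implementation.

```python
# Illustrative sketch only: the real Mimi training objective is not part of this repo.
import torch
import torch.nn.functional as F

def semantic_distillation_loss(first_level_embeddings: torch.Tensor,
                               wavlm_targets: torch.Tensor) -> torch.Tensor:
    """Encourage the first RVQ level to match a (time-aligned, projected)
    self-supervised WavLM representation. Both tensors: [batch, time, dim]."""
    return 1.0 - F.cosine_similarity(first_level_embeddings, wavlm_targets, dim=-1).mean()
```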
-<img src="moshi.png" alt="Schema representing the structure Moshi. Moshi models two streams of audio:
-    one corresponds to Moshi, and one to the user. At inference, the one from the user is taken from the audio input,
-    and the one for Moshi is sampled from. Along that, Moshi predicts text tokens corresponding to its own speech
-    for improved accuracy. A small depth transformer models inter codebook dependencies for a given step.">
+ ## Organisation of the repository There are three separate versions of the moshi inference stack in this repo. -- The python version using PyTorch is in the [`moshi/`](moshi/) directory. -- The python version using MLX for M series Macs is in the [`moshi_mlx/`](moshi_mlx/) directory. -- The rust version used in production is in the [`rust/`](rust/) directory. +- The Python version using PyTorch is in the [`moshi/`](moshi/) directory. +- The Python version using MLX for M series Macs is in the [`moshi_mlx/`](moshi_mlx/) directory. +- The Rust version used in production is in the [`rust/`](rust/) directory. This contains in particular a Mimi implementation in Rust, with Python bindings available as `rustymimi`. @@ -63,15 +62,17 @@ We release three models: - Moshi fine-tuned on a female synthetic voice (Moshika). Depending on the backend, the file format and quantization available will vary. Here is the list -of the HuggingFace repo with each model. Mimi is bundled in any of those, and always use the same checkpoint format. +of the HuggingFace repo with each model. Mimi is bundled in each of those, and always use the same checkpoint format. - Moshika for PyTorch (bf16): [kmhf/moshika-pytorch-bf16](https://huggingface.co/kmhf/moshika-pytorch-bf16). - Moshiko for PyTorch (bf16): [kmhf/moshiko-pytorch-bf16](https://huggingface.co/kmhf/moshiko-pytorch-bf16). - Moshika for MLX (int4, int8, bf16): [kmhf/moshiko-mlx-q4](https://huggingface.co/kmhf/moshika-mlx-q4), [kmhf/moshiko-mlx-q8](https://huggingface.co/kmhf/moshika-mlx-q8), [kmhf/moshiko-mlx-bf16](https://huggingface.co/kmhf/moshika-mlx-bf16). - Moshiko for MLX (int4, int8, bf16): [kmhf/moshiko-mlx-q4](https://huggingface.co/kmhf/moshiko-mlx-q4), [kmhf/moshiko-mlx-q8](https://huggingface.co/kmhf/moshiko-mlx-q8), [kmhf/moshiko-mlx-bf16](https://huggingface.co/kmhf/moshiko-mlx-bf16). -- Moshiko for Rust/Candle (int8, bf16): [kmhf/moshika-candle-q8](https://huggingface.co/kmhf/moshika-candle-q8), [kmhf/moshiko-mlx-bf16](https://huggingface.co/kmhf/moshika-candle-bf16). +- Moshika for Rust/Candle (int8, bf16): [kmhf/moshika-candle-q8](https://huggingface.co/kmhf/moshika-candle-q8), [kmhf/moshiko-mlx-bf16](https://huggingface.co/kmhf/moshika-candle-bf16). - Moshiko for Rust/Candle (int8, bf16): [kmhf/moshiko-candle-q8](https://huggingface.co/kmhf/moshiko-candle-q8), [kmhf/moshiko-mlx-bf16](https://huggingface.co/kmhf/moshiko-candle-bf16). +All models are released under the CC-BY 4.0 license. + ## Requirements You will need at least Python 3.10. For specific requirements, please check the individual backends @@ -87,11 +88,14 @@ pip install -e "git+https://git@github.com/kyutai-labs/moshi.git#egg=moshi_mlx&s pip install rustymimi # mimi, rust implementation with Python bindings from PyPI ``` +If you get an error when installing `moshi_mlx` or `rustymimi` (which `moshi_mlx` depends on), +you might need to install the [Rust toolchain](https://rustup.rs/) to install `rustymimi` from sources. + While we hope that the present codebase will work on Windows, we do not provide official support for it. -We have tested the MLX version with MacBook Pro M3. At the moment, we do not support quantization +We have tested the MLX version on a MacBook Pro M3. At the moment, we do not support quantization for the PyTorch version, so you will need a GPU with a significant amount of memory (24GB). -For using the rust backend, you will need a recent version of the [Rust toolchain](https://rustup.rs/). 
+For using the Rust backend, you will need a recent version of the [Rust toolchain](https://rustup.rs/). To compile GPU support, you will also need the [CUDA](https://developer.nvidia.com/cuda-toolkit) properly installed for your GPU, in particular with `nvcc`. ## Development @@ -112,7 +116,7 @@ maturin dev -r -m rust/mimi-pyo3/Cargo.toml ## Python (PyTorch) -The Pytorch based API can be found in the `moshi` directory. It provides a streaming +The PyTorch based API can be found in the `moshi` directory. It provides a streaming version of the audio tokenizer (mimi) and the language model (moshi). In order to run in interactive mode, you need to start a server which will @@ -126,12 +130,12 @@ python -m moshi.server [--gradio-tunnel] [--hf-repo kmhf/moshika-pytorch-bf16] And then access the web UI on [localhost:8998](http://localhost:8998). If your GPU is on a distant machine with no direct access, `--gradio-tunnel` will create a tunnel with a URL accessible from anywhere. Keep in mind that this tunnel goes through the US and can add significant latency (up to 500ms from Europe). -You can use `--gradio-tunnel-token` to set a fixed secret and reuse the same address over time. +You can use `--gradio-tunnel-token` to set a fixed secret token and reuse the same address over time. Alternatively, you might want to use SSH to redirect your connection. You can use `--hf-repo` to select a different pretrained model, by setting the proper Hugging Face repository. -Accessing a server that is not localhost via http may cause issues around using +Accessing a server that is not localhost via http may cause issues with using the microphone in the web UI (in some browsers this is only allowed using https). @@ -139,7 +143,7 @@ A local client is also available, as ```bash python -m moshi.client [--url URL_TO_GRADIO] ``` -However note, that unlike the web browser, this client is bare bone. It doesn't do any echo cancellation, +However note that, unlike the web browser, this client is barebone: It does not perform any echo cancellation, nor does it try to compensate for a growing lag by skipping frames. For more information, in particular on how to use the API directly, please @@ -157,15 +161,15 @@ python -m moshi_mlx.local -q 8 --hf-repo kmhf/moshika-mlx-q8 # be careful to always match the `-q` and `--hf-repo` flag. ``` -This uses a command line interface, which is bare bone. It doesn't do any echo cancellation, +This command line interface is also barebone. It does not perform any echo cancellation, nor does it try to compensate for a growing lag by skipping frames. -Alternatively you can use `python -m moshi_mlx.local_web` to use -the web UI, connection is via http on [localhost:8998](http://localhost:8998). +Alternatively you can run `python -m moshi_mlx.local_web` to use +the web UI, the connection is via http and will be at [localhost:8998](http://localhost:8998). ## Rust -In order to run the rust inference server, use the following command from within +In order to run the Rust inference server, use the following command from within the `rust` directory: ```bash @@ -175,24 +179,29 @@ cargo run --features cuda --bin moshi-backend -r -- --config moshi-backend/confi When using macOS, you can replace `--features cuda` with `--features metal`. Alternatively you can use `config-q8.json` rather than `config.json` to use the -quantified q8 model. You can select a different pretrained model, e.g. Moshika, +quantized q8 model. You can select a different pretrained model, e.g. 
Moshika, by changing the `"hf_repo"` key in either file. Once the server has printed 'standalone worker listening', you can use the web -UI. By default the rust version uses https so it will be at +UI. By default the Rust server uses https so it will be at [localhost:8998](https://localhost:8998). -You will get some warnings about the site being unsafe. When using chrome you -can bypass it by selecting "Details" or "Advanced", then "Visit this unsafe +You will get warnings about the site being unsafe. When using chrome you +can bypass these by selecting "Details" or "Advanced", then "Visit this unsafe site" or "Proceed to localhost (unsafe)". ## Clients -We recommend using the web UI as it provides some echo cancellation that helps -the overall model quality. Alternatively we provide some command line interfaces -for the rust and python versions, the protocol is the same as with the web UI so +We recommend using the web UI as it provides additional echo cancellation that helps +the overall model quality. Note that most command will directly serve this UI +in the provided URL, and there is in general nothing more to do. + +Alternatively, we provide command line interfaces +for the Rust and Python versions, the protocol is the same as with the web UI so there is nothing to change on the server side. +For reference, here is the list of clients for Moshi. + ### Rust Command Line From within the `rust` directory, run the following: @@ -239,4 +248,4 @@ If you use either Mimi or Moshi, please cite the following paper, } ``` -[moshi]: https://arxiv.org/ +[moshi]: https://kyutai.org/Moshi.pdf diff --git a/moshi.png b/moshi.png index 0e6c731..e2dfdfe 100644 Binary files a/moshi.png and b/moshi.png differ diff --git a/moshi/README.md b/moshi/README.md index 03b8527..00e2239 100644 --- a/moshi/README.md +++ b/moshi/README.md @@ -3,8 +3,8 @@ See the [top-level README.md][main_repo] for more information on Moshi. [Moshi][moshi] is a speech-text foundation model and full-duplex spoken dialogue framework. -It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at 12.5 Hz, and compress -audio down to 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), yet performs better than existing, non-streaming, codec. +It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at 12.5 Hz, and compresses +24 kHz audio down to 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), yet performs better than existing, non-streaming, codec. This is the PyTorch implementation for Moshi and Mimi. @@ -40,12 +40,13 @@ python -m moshi.server [--gradio-tunnel] And then access the web UI on [localhost:8998](http://localhost:8998). If your GPU is on a distant machine with no direct access, `--gradio-tunnel` will create a tunnel with a URL accessible from anywhere. Keep in mind that this tunnel goes through the US and can add significant latency (up to 500ms from Europe). -You can use `--gradio-tunnel-token` to set a fixed secret and reuse the same address over time. +You can use `--gradio-tunnel-token` to set a fixed secret token and reuse the same address over time. Alternatively, you might want to use SSH to redirect your connection. You can use `--hf-repo` to select a different pretrained model, by setting the proper Hugging Face repository. +See [the model list](https://github.com/kyutai-labs/moshi?tab=readme-ov-file#models) for a reference of the available models. 
-Accessing a server that is not localhost via http may cause issues around using +Accessing a server that is not localhost via http may cause issues with using the microphone in the web UI (in some browsers this is only allowed using https). @@ -53,11 +54,11 @@ A local client is also available, as ```bash python -m moshi.client [--url URL_TO_GRADIO] ``` -However note, that unlike the web browser, this client is bare bone. It doesn't do any echo cancellation, +However note, that unlike the web browser, this client is barebone. It does not perform any echo cancellation, nor does it try to compensate for a growing lag by skipping frames. -## API - Mimi +## API You can use programmatically the Mimi/Moshi as follows: ```python @@ -68,7 +69,7 @@ from moshi.models import loaders mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME) mimi = loaders.get_mimi(mimi_weight, device='cpu') -mimi.set_num_codebooks(8) # up to 32. +mimi.set_num_codebooks(8) # up to 32 for mimi, but limited to 8 for moshi. wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T] with torch.no_grad(): @@ -111,7 +112,7 @@ pre-commit install Once locally installed, Mimi can be tested with the following command, from **the root** of the repository, ```bash wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3 -python scripts/mimi_test.py +python scripts/mimi_streaming_test.py ``` @@ -141,5 +142,5 @@ If you use either Mimi or Moshi, please cite the following paper, } ``` -[moshi]: https://arxiv.org/ +[moshi]: https://kyutai.org/Moshi.pdf [main_repo]: https://github.com/kyutai-labs/moshi diff --git a/moshi_mlx/README.md b/moshi_mlx/README.md index 73dca2c..f30c67f 100644 --- a/moshi_mlx/README.md +++ b/moshi_mlx/README.md @@ -3,8 +3,8 @@ See the [top-level README.md][main_repo] for more information on Moshi. [Moshi][moshi] is a speech-text foundation model and full-duplex spoken dialogue framework. -It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at 12.5 Hz, and compress -audio down to 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), yet performs better than existing, non-streaming, codec. +It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at a framerate of 12.5 Hz, and compress +24 kHz audio down to 1.1 kbps, in a fully streaming manner (latency of 80ms, the frame size), yet performs better than existing, non-streaming, codec. This is the MLX implementation for Moshi. For Mimi, this uses our Rust based implementation through the Python binding provided in `rustymimi`, available in the [rust/](https://github.com/kyutai-labs/moshi/tree/main/rust) folder of our main repository. @@ -19,21 +19,31 @@ pip install -e "git+https://git@github.com/kyutai-labs/moshi#egg=moshi_mlx&subdi ``` We have tested the MLX version with MacBook Pro M3. +If you get an error when installing `moshi_mlx` or `rustymimi` (which `moshi_mlx` depends on), +you might need to install the [Rust toolchain](https://rustup.rs/) to install `rustymimi` from sources. + ## Usage -Then the model can be run with: +Once you have installed `moshi_mlx`, you can run ```bash python -m moshi_mlx.local -q 4 # weights quantized to 4 bits python -m moshi_mlx.local -q 8 # weights quantized to 8 bits +# And using a different pretrained model: +python -m moshi_mlx.local -q 4 --hf-repo kmhf/moshika-mlx-q4 +python -m moshi_mlx.local -q 8 --hf-repo kmhf/moshika-mlx-q8 +# be careful to always match the `-q` and `--hf-repo` flag. 
``` -This uses a command line interface, which is bare bone. It doesn't do any echo cancellation, +This uses a command line interface, which is barebone. It does not perform any echo cancellation, nor does it try to compensate for a growing lag by skipping frames. +You can use `--hf-repo` to select a different pretrained model, by setting the proper Hugging Face repository. +See [the model list](https://github.com/kyutai-labs/moshi?tab=readme-ov-file#models) for a reference of the available models. + Alternatively you can use `python -m moshi_mlx.local_web` to use -the web UI, connection is via http on [localhost:8998](http://localhost:8998). +the web UI, the connection is via http, at [localhost:8998](http://localhost:8998). ## License @@ -47,12 +57,12 @@ If you use either Mimi or Moshi, please cite the following paper, ``` @article{defossez2024moshi, title={Moshi: a speech-text foundation model for real-time dialogue}, - author={Alexandre Défossez and Laurent Mazaré and Manu Orsini and Amélie Royer and + author={Alexandre Défossez and Laurent Mazaré and Manu Orsini and Amélie Royer and Patrick Pérez and Hervé Jégou and Edouard Grave and Neil Zeghidour}, journal={arXiv:TBC}, year={2024}, } ``` -[moshi]: https://arxiv.org/ +[moshi]: https://kyutai.org/Moshi.pdf [main_repo]: https://github.com/kyutai-labs/moshi diff --git a/scripts/moshi_benchmark.py b/scripts/moshi_benchmark.py index a70b984..081e9d0 100644 --- a/scripts/moshi_benchmark.py +++ b/scripts/moshi_benchmark.py @@ -24,7 +24,7 @@ parser.add_argument("--mimi-weight", type=str, default=loaders.MIMI_V0_1, help="Name of the Mimi checkpoint in the given HF repo, or path to a local file.") parser.add_argument("--hf-repo", type=str, default=loaders.HF_REPO, - help="HF repo to look into, defaults to Kyutai official one.") + help="HF repo to look into, defaults to Kyutai's official one.") parser.add_argument("--steps", default=100, type=int) parser.add_argument("--profile", action="store_true") parser.add_argument("--device", type=str, default='cuda') diff --git a/scripts/test_mimi.py b/scripts/test_mimi.py index 5aefdfa..be63e1c 100644 --- a/scripts/test_mimi.py +++ b/scripts/test_mimi.py @@ -5,8 +5,6 @@ import argparse import numpy as np import time -from pathlib import Path -import sentencepiece import rustymimi @@ -18,7 +16,7 @@ def main(): args = parser.parse_args() steps = args.steps - model = mimi.Tokenizer(str(args.model)) + model = rustymimi.Tokenizer(str(args.model)) print(model) start_time = 0
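As a side note to the `moshi/README.md` changes above, the Mimi PyTorch API snippet shown there can be extended into a minimal encode/decode round trip. The sketch below reuses the calls from that snippet; the `encode`/`decode` round-trip methods and the exact output shapes are assumptions based on that documentation, so treat this as a sketch rather than a reference implementation.

```python
# Minimal round-trip sketch of the Mimi PyTorch API shown in moshi/README.md above.
# Assumes `encode`/`decode` behave as round-trip counterparts; exact shapes may differ.
import torch
from huggingface_hub import hf_hub_download
from moshi.models import loaders

mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
mimi = loaders.get_mimi(mimi_weight, device='cpu')
mimi.set_num_codebooks(8)  # up to 32 for Mimi, but limited to 8 for Moshi

wav = torch.randn(1, 1, 24000 * 10)  # [B, C=1, T], 10 seconds of audio at 24 kHz
with torch.no_grad():
    codes = mimi.encode(wav)            # discrete tokens, roughly [B, K=8, T_frames]
    reconstructed = mimi.decode(codes)  # back to a 24 kHz waveform
```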