Merge pull request #28 from kyutai-labs/refacto

refacto [1/N]
kyutai-labs · Sep 18, 2024 · 5112385 · 5112385
2 parents bdf176d + 811c885
commit 5112385
Show file tree

Hide file tree

Showing 20 changed files with 324 additions and 246 deletions.
diff --git a/moshi/README.md b/moshi/README.md
@@ -0,0 +1 @@
+# moshi - pytorch
diff --git a/moshi/moshi/client.py b/moshi/moshi/client.py
@@ -1,16 +1,17 @@
 # Copyright (c) Kyutai, all rights reserved.
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Client for the Moshi server."""
 
 import argparse
 import asyncio
 import queue
 import sys
 
+import aiohttp
 import numpy as np
 import sphn
 import sounddevice as sd
-import aiohttp
 
 from .client_utils import AnyPrinter, Printer, RawPrinter
 
@@ -141,7 +142,27 @@ async def run(self) -> None:
 
 
 async def run(printer: AnyPrinter, args):
-    uri = f"ws://{args.host}:{args.port}/api/chat"
+    if args.url is None:
+        proto = "ws"
+        if args.https:
+            proto += "s"
+        uri = f"{proto}://{args.host}:{args.port}/api/chat"
+    else:
+        proto = "wss"
+        if '://' in args.url:
+            proto, without_proto = args.url.split('://', 1)
+            if proto in ['ws', 'http']:
+                proto = "ws"
+            elif proto in ['wss', 'https']:
+                proto = "wss"
+            else:
+                printer.log("error", "The provided URL {args.url} seems to contain a protocol but it is unknown.")
+                sys.exit(1)
+        else:
+            without_proto = args.url
+        uri = f"{proto}://{without_proto}/api/chat"
+
+    printer.log("info", "Connecting to {uri}.")
     async with aiohttp.ClientSession() as session:
         async with session.ws_connect(uri) as ws:
             printer.log("info", "connected!")
@@ -152,8 +173,11 @@ async def run(printer: AnyPrinter, args):
 
 def main():
     parser = argparse.ArgumentParser("client_opus")
-    parser.add_argument("--host", default="localhost", type=str)
-    parser.add_argument("--port", default=8998, type=int)
+    parser.add_argument("--host", default="localhost", type=str, help="Hostname to connect to.")
+    parser.add_argument("--port", default=8998, type=int, help="Port to connect to.")
+    parser.add_argument("--https", action='store_true',
+                        help="Set this flag for using a https connection.")
+    parser.add_argument("--url", type=str, help='Provides directly a URL, e.g. to a gradio tunnel.')
     args = parser.parse_args()
     printer: AnyPrinter
 

diff --git a/moshi/moshi/client_utils.py b/moshi/moshi/client_utils.py
@@ -1,6 +1,8 @@
 # Copyright (c) Kyutai, all rights reserved.
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Utilities for the command line client, in particular for handling interactions with the terminal.
+"""
 
 from dataclasses import dataclass
 import sys
@@ -14,11 +16,11 @@ def colorize(text, color):
 
 def make_log(level: str, msg: str) -> str:
     if level == "warning":
-        prefix = colorize("Warning:", "1;31")
+        prefix = colorize("[Warn]", "1;31")
     elif level == "info":
-        prefix = colorize("Info:", "1;34")
+        prefix = colorize("[Info]", "1;34")
     elif level == "error":
-        prefix = colorize("Error:", "1;31")
+        prefix = colorize("[Err ]", "1;31")
     else:
         raise ValueError(f"Unknown level {level}")
     return prefix + " " + msg

diff --git a/moshi/moshi/models/__init__.py b/moshi/moshi/models/__init__.py
@@ -1,20 +1,14 @@
 # Copyright (c) Kyutai, all rights reserved.
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
 """
-Models for EnCodec, AudioGen, MusicGen, as well as the generic LMModel.
+Models for the compression model Moshi,
 """
 
 # flake8: noqa
-from .encodec import (
+from .compression import (
     CompressionModel,
-    EncodecModel,
+    MimiModel,
 )
 from .lm import LMModel, LMGen
-from .moshi_ import get_encodec, get_lm
+from .loaders import get_mimi, get_moshi_lm
diff --git a/moshi/moshi/models/compression.py b/moshi/moshi/models/compression.py
@@ -2,13 +2,15 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Part of this file is adapted from encodec.py in https://github.com/facebookresearch/audiocraft
+# released under the following license.
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-"""Compression models or wrapper around existing models.
-Also defines the main interface that a model must follow to be usable as an audio tokenizer.
+"""Compression models or wrapper around existing models. In particular, provides the implementation
+for Mimi. Also defines the main interface that a model must follow to be usable as an audio tokenizer.
 """
 
 from abc import abstractmethod
@@ -19,7 +21,6 @@
 
 import torch
 from torch import nn
-from torch.nn import functional as F
 
 
 from ..quantization import (
@@ -46,12 +47,12 @@ def forward(self, x: torch.Tensor) -> QuantizedResult: ...
 
     @abstractmethod
     def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """See `EncodecModel.encode`."""
+        """See `MimiModel.encode`."""
         ...
 
     @abstractmethod
     def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """See `EncodecModel.decode`."""
+        """See `MimiModel.decode`."""
         ...
 
     @abstractmethod
@@ -90,16 +91,16 @@ def set_num_codebooks(self, n: int):
 
 
 @dataclass
-class _EncodecState:
+class _MimiState:
     graphed_tr_enc: CUDAGraphed | None
     graphed_tr_dec: CUDAGraphed | None
 
     def reset(self):
         pass
 
 
-class EncodecModel(CompressionModel[_EncodecState]):
-    """Encodec model operating on the raw waveform.
+class MimiModel(CompressionModel[_MimiState]):
+    """Mimi model operating on the raw waveform.
 
     Args:
         encoder (nn.Module): Encoder network.
@@ -122,6 +123,7 @@ class EncodecModel(CompressionModel[_EncodecState]):
         torch_compile_encoder_decoder (bool): if True, uses torch.compile on the encoder / decoder.
             Deactivated by default for training as this is incompatible at the moment with weight norm.
             See https://github.com/pytorch/pytorch/issues/121902
+            Also this seems to work well with 2.2.0, but completely fail with 2.4.0.
     """
 
     def __init__(
@@ -217,14 +219,16 @@ def __init__(
                     channel_wise=upsample_channel_wise_bug,
                 )
 
-    def _init_streaming_state(self, batch_size: int) -> _EncodecState:
+    def _init_streaming_state(self, batch_size: int) -> _MimiState:
+        device = next(self.parameters()).device
+        disable = device.type != 'cuda'
         graphed_tr_dec = None
         graphed_tr_enc = None
         if self.encoder_transformer is not None:
-            graphed_tr_enc = CUDAGraphed(self.encoder_transformer)
+            graphed_tr_enc = CUDAGraphed(self.encoder_transformer, disable=disable)
         if self.decoder_transformer is not None:
-            graphed_tr_dec = CUDAGraphed(self.decoder_transformer)
-        return _EncodecState(graphed_tr_enc, graphed_tr_dec)
+            graphed_tr_dec = CUDAGraphed(self.decoder_transformer, disable=disable)
+        return _MimiState(graphed_tr_enc, graphed_tr_dec)
 
     @property
     def channels(self) -> int:
@@ -368,7 +372,8 @@ def encode(self, x: torch.Tensor) -> torch.Tensor:
             x (torch.Tensor): Float tensor of shape [B, C, T]
 
         Returns:
-            codes (torch.Tensor): an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
+            codes (torch.Tensor): an int tensor of shape [B, K, T]
+                with K the number of codebooks used and T the timestep.
         """
         emb = self._encode_to_unquantized_latent(x)
         codes = self.quantizer.encode(emb)

diff --git a/moshi/moshi/models/lm.py b/moshi/moshi/models/lm.py
@@ -372,8 +372,9 @@ def _init_streaming_state(self, batch_size: int) -> _LMGenState:
             dtype=torch.long,
         )
 
-        graphed_main = CUDAGraphed(lm_model.forward_text)
-        graphed_depth = CUDAGraphed(self.depformer_step)
+        disable = lm_model.device.type != 'cuda'
+        graphed_main = CUDAGraphed(lm_model.forward_text, disable=disable)
+        graphed_depth = CUDAGraphed(self.depformer_step, disable=disable)
 
         return _LMGenState(cache, initial, graphed_main, graphed_depth)
 

diff --git a/moshi/moshi/models/loaders.py b/moshi/moshi/models/loaders.py
@@ -1,21 +1,29 @@
 # Copyright (c) Kyutai, all rights reserved.
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Retrieves the pretrained models for Moshi and Mimi."""
+from pathlib import Path
 
-from ..modules import SEANetEncoder, SEANetDecoder, transformer
-from .encodec import EncodecModel
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_model
+import sentencepiece
+import torch
+
+from .compression import MimiModel
 from .lm import LMModel
+from ..modules import SEANetEncoder, SEANetDecoder, transformer
 from ..quantization import SplitResidualVectorQuantizer
-import torch
-from safetensors.torch import load_model
-from pathlib import Path
-import typing as tp
 
 SAMPLE_RATE = 24000
 FRAME_RATE = 12.5
+HF_REPO = 'kmhf/msh-v0.1'
+MIMI_V0_1 = 'tokenizer-e351c8d8-checkpoint125.safetensors'
+MOSHIKO_V0_1 = '[email protected]'
+MOSHIKA_V0_1 = '[email protected]'
+TEXT_TOKENIZER_V0_1 = 'tokenizer_spm_32k_3.model'
 
 
-seanet_kwargs = {
+_seanet_kwargs = {
     "channels": 1,
     "dimension": 512,
     "causal": True,
@@ -35,15 +43,15 @@
     "ratios": [8, 6, 5, 4],
     "true_skip": True,
 }
-quantizer_kwargs = {
+_quantizer_kwargs = {
     "dimension": 256,
     "n_q": 32,
     "bins": 2048,
-    "input_dimension": seanet_kwargs["dimension"],
-    "output_dimension": seanet_kwargs["dimension"],
+    "input_dimension": _seanet_kwargs["dimension"],
+    "output_dimension": _seanet_kwargs["dimension"],
 }
-transformer_kwargs = {
-    "d_model": seanet_kwargs["dimension"],
+_transformer_kwargs = {
+    "d_model": _seanet_kwargs["dimension"],
     "num_heads": 8,
     "num_layers": 8,
     "causal": True,
@@ -55,17 +63,17 @@
     "norm": "layer_norm",
     "positional_embedding": "rope",
     "dim_feedforward": 2048,
-    "input_dimension": seanet_kwargs["dimension"],
-    "output_dimensions": [seanet_kwargs["dimension"]],
+    "input_dimension": _seanet_kwargs["dimension"],
+    "output_dimensions": [_seanet_kwargs["dimension"]],
 }
 
-lm_kwargs = {
+_lm_kwargs = {
     "dim": 4096,
     "text_card": 32000,
     "existing_text_padding_id": 3,
     "n_q": 16,
     "dep_q": 8,
-    "card": quantizer_kwargs["bins"],
+    "card": _quantizer_kwargs["bins"],
     "num_heads": 32,
     "num_layers": 32,
     "hidden_scale": 4.125,
@@ -92,24 +100,40 @@
 }
 
 
-def _is_safetensors(filename: tp.Union[str, Path]) -> bool:
-    filename = Path(filename)
-    return filename.suffix in (".safetensors", ".sft", ".sfts")
+def _is_safetensors(path: Path | str) -> bool:
+    return Path(path).suffix in (".safetensors", ".sft", ".sfts")
 
 
-def get_encodec(filename: tp.Union[str, Path], device):
-    encoder = SEANetEncoder(**seanet_kwargs)
-    decoder = SEANetDecoder(**seanet_kwargs)
+def resolve_model_checkpoint(name: str, hf_repo: str = HF_REPO, allow_local_file: bool = True) -> Path:
+    """Load a model checkpoint from HF.
+    If `allow_local_file` is True, then if a file `name` exists, it will be used instead.
+    """
+    if allow_local_file and Path(name).exists():
+        return Path(name)
+    else:
+        filename = name
+    return Path(hf_hub_download(hf_repo, filename))
+
+
+def get_text_tokenizer(filename: str | Path) -> sentencepiece.SentencePieceProcessor:
+    return sentencepiece.SentencePieceProcessor(str(filename))  # type: ignore
+
+
+def get_mimi(filename: str | Path,
+             device: torch.device | str = 'cpu') -> MimiModel:
+    """Return a pretrained Mimi model."""
+    encoder = SEANetEncoder(**_seanet_kwargs)
+    decoder = SEANetDecoder(**_seanet_kwargs)
     encoder_transformer = transformer.ProjectedTransformer(
-        device=device, **transformer_kwargs
+        device=device, **_transformer_kwargs
     )
     decoder_transformer = transformer.ProjectedTransformer(
-        device=device, **transformer_kwargs
+        device=device, **_transformer_kwargs
     )
     quantizer = SplitResidualVectorQuantizer(
-        **quantizer_kwargs,
+        **_quantizer_kwargs,
     )
-    model = EncodecModel(
+    model = MimiModel(
         encoder,
         decoder,
         quantizer,
@@ -126,21 +150,19 @@ def get_encodec(filename: tp.Union[str, Path], device):
     if _is_safetensors(filename):
         load_model(model, filename)
     else:
-        pkg = torch.load(
-            filename,
-            "cpu",
-        )
+        pkg = torch.load(filename, "cpu")
         model.load_state_dict(pkg["model"])
     model.set_num_codebooks(8)
     return model
 
 
-def get_lm(filename: tp.Union[str, Path], device):
+def get_moshi_lm(filename: str | Path,
+                 device: torch.device | str = 'cpu') -> LMModel:
     dtype = torch.bfloat16
     model = LMModel(
         device=device,
         dtype=dtype,
-        **lm_kwargs,
+        **_lm_kwargs,
     ).to(device=device, dtype=dtype)
     model.eval()
     if _is_safetensors(filename):