Litgpt serve API #1299

Merged · 21 commits · Apr 17, 2024
Changes from 4 commits
24 changes: 23 additions & 1 deletion README.md
@@ -144,7 +144,7 @@ litgpt chat \
### Continue pretraining an LLM
This is another way of finetuning that specializes an already pretrained model by training on custom data:

```
```bash
mkdir -p custom_texts
curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt
@@ -166,6 +166,28 @@ litgpt chat \
--checkpoint_dir out/custom-model/final
```

### Deploy an LLM

This example illustrates how to deploy an LLM using LitGPT:

```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

```python
# 3) Use the server (in a separate session)
import requests, json
response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)
print(response.content)
```
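
`response.content` holds the raw JSON payload returned by the server. A minimal sketch of extracting just the generated text (assuming the server from step 2 above is running on the default port):

```python
import json

# The server replies with a JSON object of the form {"output": "..."}
output_text = json.loads(response.content)["output"]
print(output_text)
```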

 

> [!NOTE]
3 changes: 3 additions & 0 deletions litgpt/__main__.py
@@ -24,6 +24,8 @@
from litgpt.scripts.download import download_from_hub as download_fn
from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn
from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn
from litgpt.deploy.serve import run_server as serve_fn


if TYPE_CHECKING:
from jsonargparse import ArgumentParser
@@ -80,6 +82,7 @@ def main() -> None:
},
"merge_lora": {"help": "Merges the LoRA weights with the base model.", "fn": merge_lora_fn},
"evaluate": {"help": "Evaluate a model with the LM Evaluation Harness.", "fn": evaluate_fn},
"serve": {"help": "Serve and deploy a model with LitServe.", "fn": serve_fn},
}

from jsonargparse import set_config_read_mode, set_docstring_parse_options
132 changes: 132 additions & 0 deletions litgpt/deploy/serve.py
@@ -0,0 +1,132 @@
from pathlib import Path
from typing import Dict, Any, Optional
from litgpt.utils import check_valid_checkpoint_dir

import lightning as L
import torch
from litserve import LitAPI, LitServer

from litgpt.model import GPT
from litgpt.config import Config
from litgpt.tokenizer import Tokenizer
from litgpt.generate.base import generate
from litgpt.prompts import load_prompt_style, has_prompt_style, PromptStyle
from litgpt.utils import load_checkpoint, CLI


class SimpleLitAPI(LitAPI):
def __init__(self,
checkpoint_dir: Path,
precision: Optional[str] = None,
temperature: float = 0.8,
top_k: int = 200,
max_generated_tokens: int = 30) -> None:

super().__init__()
self.checkpoint_dir = checkpoint_dir
self.precision = precision
self.temperature = temperature
self.top_k = top_k
self.max_generated_tokens = max_generated_tokens

def setup(self, device: str) -> None:
# Setup the model so it can be called in `predict`.
config = Config.from_file(self.checkpoint_dir / "model_config.yaml")
device = torch.device(device)
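        # Trade a bit of float32 matmul precision for speed (uses TF32 on supported GPUs)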
torch.set_float32_matmul_precision("high")
fabric = L.Fabric(
accelerator=device.type,
devices=[device.index],
precision=self.precision
)
checkpoint_path = self.checkpoint_dir / "lit_model.pth"
self.tokenizer = Tokenizer(self.checkpoint_dir)
self.prompt_style = (
load_prompt_style(self.checkpoint_dir)
if has_prompt_style(self.checkpoint_dir)
else PromptStyle.from_config(config)
)
with fabric.init_module(empty_init=True):
model = GPT(config)
with fabric.init_tensor():
# enable the kv cache
model.set_kv_cache(batch_size=1)
model.eval()

self.model = fabric.setup_module(model)
load_checkpoint(fabric, self.model, checkpoint_path)
self.device = fabric.device

def decode_request(self, request: Dict[str, Any]) -> Any:
# Convert the request payload to your model input.
prompt = request["prompt"]
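        # Apply the model's prompt template (chat/instruct formatting) before tokenization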
prompt = self.prompt_style.apply(prompt)
encoded = self.tokenizer.encode(prompt, device=self.device)
return encoded

def predict(self, inputs: torch.Tensor) -> Any:
# Run the model on the input and return the output.
prompt_length = inputs.size(0)
max_returned_tokens = prompt_length + self.max_generated_tokens

y = generate(
self.model,
inputs,
max_returned_tokens,
temperature=self.temperature,
top_k=self.top_k,
eos_id=self.tokenizer.eos_id
)

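        # Clear the KV cache so the next request starts from a clean state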
for block in self.model.transformer.h:
block.attn.kv_cache.reset_parameters()
return y

def encode_response(self, output: torch.Tensor) -> Dict[str, Any]:
# Convert the model output to a response payload.
decoded_output = self.tokenizer.decode(output)
return {"output": decoded_output}


def run_server(
checkpoint_dir: Path = Path("checkpoints"),
precision: Optional[str] = None,
temperature: float = 0.8,
top_k: int = 200,
max_generated_tokens: int = 30,
devices: int = 1,
accelerator: str = "cuda",
port: int = 8000
) -> None:
"""Serve a LitGPT model using LitServe

Arguments:
checkpoint_dir: The checkpoint directory to load the model from.
precision: Optional precision setting to instantiate the model weights in. By default, this will
automatically be inferred from the metadata in the given ``checkpoint_dir`` directory.
        temperature: Temperature setting for the text generation. Values above 1 increase randomness,
            values below 1 decrease randomness.
top_k: The size of the pool of potential next tokens. Values larger than 1 result in more novel
generated text but can also lead to more incoherent texts.
max_generated_tokens: How many new tokens, in addition to the prompt length, to generate.
devices: How many devices/GPUs to use.
accelerator: The type of accelerator to use. For example, "cuda" or "cpu".
port: The network port number on which the model is configured to be served.
"""
check_valid_checkpoint_dir(checkpoint_dir, model_filename="lit_model.pth")

server = LitServer(
SimpleLitAPI(
checkpoint_dir, precision,
temperature=temperature,
top_k=top_k,
max_generated_tokens=max_generated_tokens,
),
accelerator=accelerator,
devices=devices)

server.run(port=port)


if __name__ == "__main__":
CLI(run_server)
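
For orientation, `run_server` can also be invoked directly from Python rather than through the `litgpt serve` command; a minimal sketch using the defaults defined above (this assumes the phi-2 checkpoint from the README example has already been downloaded):

```python
from pathlib import Path

from litgpt.deploy.serve import run_server

# Roughly equivalent to: litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
run_server(
    checkpoint_dir=Path("checkpoints/microsoft/phi-2"),
    temperature=0.8,          # sampling temperature (default)
    top_k=200,                # top-k sampling pool size (default)
    max_generated_tokens=30,  # new tokens to generate per request
    accelerator="cuda",       # or "cpu"
    port=8000,
)
```
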
1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"torch>=2.2.0",
"lightning==2.3.0.dev20240328",
"jsonargparse[signatures]>=4.27.6",
"litserve", # imported by litgpt.deploy
]

[project.urls]
37 changes: 37 additions & 0 deletions tutorials/0_to_litgpt.md
@@ -464,6 +464,43 @@ litgpt evaluate \
(A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).)


 
## Deploy LLMs

You can deploy LitGPT LLMs using your tool of choice. Below is an example using LitGPT's built-in serving capabilities:


```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

```python
# 3) Use the server (in a separate session)
import requests, json
response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)
print(response.content)
```

This prints:

```
b'{"output":"Instruct: Fix typos in the following sentence: Exampel input\\nOutput: Example input: Hello World\\n"}'
```


 
**More information and additional resources**

- [tutorials/deploy](deploy.md): A full deployment tutorial and example


 
## Converting LitGPT model weights to `safetensors` format

51 changes: 51 additions & 0 deletions tutorials/deploy.md
@@ -0,0 +1,51 @@
# Serve and Deploy LLMs

This document shows how you can serve a LitGPT model for deployment.

 
## Serve an LLM

This section illustrates how to set up a minimal and highly scalable inference server for a phi-2 LLM using `litgpt serve`.


 
## Step 1: Start the inference server


```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

> [!TIP]
> Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more.


 
## Step 2: Query the inference server

You can now send requests to the inference server you started in step 1. For example, in a new Python session, you can query it as follows:


```python
import requests, json

response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)

decoded_string = response.content.decode("utf-8")
output_str = json.loads(decoded_string)["output"]
print(output_str)
```

Executing the code above prints the following output:

```
Instruct: Fix typos in the following sentence: Exampel input
Output: Example input.
```