Litgpt serve API #1299

Merged · 21 commits · Apr 17, 2024
Changes from 4 commits
24 changes: 23 additions & 1 deletion README.md
@@ -144,7 +144,7 @@ litgpt chat \
### Continue pretraining an LLM
This is another way of finetuning that specializes an already pretrained model by training on custom data:

```
```bash
mkdir -p custom_texts
curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt
@@ -166,6 +166,28 @@ litgpt chat \
--checkpoint_dir out/custom-model/final
```

### Deploy an LLM

This example illustrates how to deploy an LLM using LitGPT:

```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

```python
# 3) Use the server (in a separate session)
import requests, json
response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)
print(response.content)
```
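
`response.content` holds the raw JSON payload returned by the server. A minimal sketch of extracting just the generated text (assuming the server from step 2 above is running on the default port):

```python
import json

# The server replies with a JSON object of the form {"output": "..."}
output_text = json.loads(response.content)["output"]
print(output_text)
```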

 

> [!NOTE]
3 changes: 3 additions & 0 deletions litgpt/__main__.py
@@ -24,6 +24,8 @@
from litgpt.scripts.download import download_from_hub as download_fn
from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn
from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn
from litgpt.deploy.serve import run_server as serve_fn


if TYPE_CHECKING:
from jsonargparse import ArgumentParser
@@ -80,6 +82,7 @@ def main() -> None:
},
"merge_lora": {"help": "Merges the LoRA weights with the base model.", "fn": merge_lora_fn},
"evaluate": {"help": "Evaluate a model with the LM Evaluation Harness.", "fn": evaluate_fn},
"serve": {"help": "Serve and deploy a model with LitServe.", "fn": serve_fn},
}

from jsonargparse import set_config_read_mode, set_docstring_parse_options
132 changes: 132 additions & 0 deletions litgpt/deploy/serve.py
@@ -0,0 +1,132 @@
from pathlib import Path
from typing import Dict, Any, Optional
from litgpt.utils import check_valid_checkpoint_dir

import lightning as L
import torch
from litserve import LitAPI, LitServer

from litgpt.model import GPT
from litgpt.config import Config
from litgpt.tokenizer import Tokenizer
from litgpt.generate.base import generate
from litgpt.prompts import load_prompt_style, has_prompt_style, PromptStyle
from litgpt.utils import load_checkpoint, CLI


class SimpleLitAPI(LitAPI):
def __init__(self,
checkpoint_dir: Path,
precision: Optional[str] = None,
temperature: float = 0.8,
top_k: int = 200,
max_generated_tokens: int = 30) -> None:

super().__init__()
self.checkpoint_dir = checkpoint_dir
self.precision = precision
self.temperature = temperature
self.top_k = top_k
self.max_generated_tokens = max_generated_tokens

def setup(self, device: str) -> None:
# Setup the model so it can be called in `predict`.
config = Config.from_file(self.checkpoint_dir / "model_config.yaml")
device = torch.device(device)
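        # Trade a bit of float32 matmul precision for speed (uses TF32 on supported GPUs)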
torch.set_float32_matmul_precision("high")
fabric = L.Fabric(
accelerator=device.type,
devices=[device.index],
precision=self.precision
)
checkpoint_path = self.checkpoint_dir / "lit_model.pth"
self.tokenizer = Tokenizer(self.checkpoint_dir)
self.prompt_style = (
load_prompt_style(self.checkpoint_dir)
if has_prompt_style(self.checkpoint_dir)
else PromptStyle.from_config(config)
)
with fabric.init_module(empty_init=True):
model = GPT(config)
with fabric.init_tensor():
# enable the kv cache
model.set_kv_cache(batch_size=1)
model.eval()

self.model = fabric.setup_module(model)
load_checkpoint(fabric, self.model, checkpoint_path)
self.device = fabric.device

def decode_request(self, request: Dict[str, Any]) -> Any:
# Convert the request payload to your model input.
prompt = request["prompt"]
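        # Apply the model's prompt template (chat/instruct formatting) before tokenization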
prompt = self.prompt_style.apply(prompt)
encoded = self.tokenizer.encode(prompt, device=self.device)
return encoded

def predict(self, inputs: torch.Tensor) -> Any:
# Run the model on the input and return the output.
prompt_length = inputs.size(0)
max_returned_tokens = prompt_length + self.max_generated_tokens

y = generate(
self.model,
inputs,
max_returned_tokens,
temperature=self.temperature,
top_k=self.top_k,
eos_id=self.tokenizer.eos_id
)

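        # Clear the KV cache so the next request starts from a clean state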
for block in self.model.transformer.h:
block.attn.kv_cache.reset_parameters()
return y

def encode_response(self, output: torch.Tensor) -> Dict[str, Any]:
# Convert the model output to a response payload.
decoded_output = self.tokenizer.decode(output)
return {"output": decoded_output}


def run_server(
checkpoint_dir: Path = Path("checkpoints"),
precision: Optional[str] = None,
temperature: float = 0.8,
top_k: int = 200,
max_generated_tokens: int = 30,
devices: int = 1,
accelerator: str = "cuda",
port: int = 8000
) -> None:
"""Serve a LitGPT model using LitServe

Arguments:
checkpoint_dir: The checkpoint directory to load the model from.
precision: Optional precision setting to instantiate the model weights in. By default, this will
automatically be inferred from the metadata in the given ``checkpoint_dir`` directory.
        temperature: Temperature setting for the text generation. Values above 1 increase randomness,
            values below 1 decrease randomness.
top_k: The size of the pool of potential next tokens. Values larger than 1 result in more novel
generated text but can also lead to more incoherent texts.
max_generated_tokens: How many new tokens, in addition to the prompt length, to generate.
devices: How many devices/GPUs to use.
accelerator: The type of accelerator to use. For example, "cuda" or "cpu".
port: The network port number on which the model is configured to be served.
"""
check_valid_checkpoint_dir(checkpoint_dir, model_filename="lit_model.pth")

server = LitServer(
SimpleLitAPI(
checkpoint_dir, precision,
temperature=temperature,
top_k=top_k,
max_generated_tokens=max_generated_tokens,
),
accelerator=accelerator,
devices=devices)

server.run(port=port)


if __name__ == "__main__":
CLI(run_server)
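
For orientation, `run_server` can also be invoked directly from Python rather than through the `litgpt serve` command; a minimal sketch using the defaults defined above (this assumes the phi-2 checkpoint from the README example has already been downloaded):

```python
from pathlib import Path

from litgpt.deploy.serve import run_server

# Roughly equivalent to: litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
run_server(
    checkpoint_dir=Path("checkpoints/microsoft/phi-2"),
    temperature=0.8,          # sampling temperature (default)
    top_k=200,                # top-k sampling pool size (default)
    max_generated_tokens=30,  # new tokens to generate per request
    accelerator="cuda",       # or "cpu"
    port=8000,
)
```
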
1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"torch>=2.2.0",
"lightning==2.3.0.dev20240328",
"jsonargparse[signatures]>=4.27.6",
"litserve", # imported by litgpt.deploy
]

[project.urls]
37 changes: 37 additions & 0 deletions tutorials/0_to_litgpt.md
@@ -464,6 +464,43 @@ litgpt evaluate \
(A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).)


 
## Deploy LLMs

You can deploy LitGPT LLMs using your tool of choice. Below is an example using LitGPT's built-in serving capabilities:


```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

```python
# 3) Use the server (in a separate session)
import requests, json
response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)
print(response.content)
```

This prints:

```
b'{"output":"Instruct: Fix typos in the following sentence: Exampel input\\nOutput: Example input: Hello World\\n"}'
```


 
**More information and additional resources**

- [tutorials/deploy](deploy.md): A full deployment tutorial and example


 
## Converting LitGPT model weights to `safetensors` format

51 changes: 51 additions & 0 deletions tutorials/deploy.md
@@ -0,0 +1,51 @@
# Serve and Deploy LLMs

This document shows how you can serve a LitGPT model for deployment.

 
## Serve an LLM

This section illustrates how to set up a minimal and highly scalable inference server for a phi-2 LLM using `litgpt serve`.


 
## Step 1: Start the inference server


```bash
# 1) Download a pretrained model (alternatively, use your own finetuned model)
litgpt download --repo_id microsoft/phi-2

# 2) Start the server
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
```

> [!TIP]
> Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more.


 
## Step 2: Query the inference server

You can now send requests to the inference server you started in step 1. For example, in a new Python session, you can query it as follows:


```python
import requests, json

response = requests.post(
"http://127.0.0.1:8000/predict",
json={"prompt": "Fix typos in the following sentence: Exampel input"}
)

decoded_string = response.content.decode("utf-8")
output_str = json.loads(decoded_string)["output"]
print(output_str)
```

Executing the code above prints the following output:

```
Instruct: Fix typos in the following sentence: Exampel input
Output: Example input.
```