Merge branch 'vllm'
mmoskal committed Feb 16, 2024
2 parents dac04ab + 64cc30e commit 06315a4
Showing 22 changed files with 426 additions and 391 deletions.
8 changes: 3 additions & 5 deletions .devcontainer/Dockerfile-vllm
@@ -1,7 +1,7 @@
# syntax = edrevo/dockerfile-plus
# ^^^ this line enables the INCLUDE+ directive

FROM nvcr.io/nvidia/pytorch:23.09-py3
FROM nvcr.io/nvidia/pytorch:23.10-py3

INCLUDE+ cuda-settings.dockerfile
INCLUDE+ common.dockerfile
@@ -15,11 +15,9 @@
RUN pip install -r /tmp/requirements.txt
# RUN pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers

# takes forever!
RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
# RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
# RUN pip install typing_extensions==4.5.0

RUN pip install -U flash-attn

# RUN pip install -U flash-attn

# RUN pip install torch==2.1.0 nvidia-cuda-runtime
# the .so file seems to be missing
40 changes: 31 additions & 9 deletions .devcontainer/vllm-requirements.txt
@@ -1,19 +1,25 @@
# vllm: requirements.txt
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.0
transformers >= 4.33.1 # Required for Code Llama.
xformers >= 0.0.21
torch == 2.1.2
transformers >= 4.37.0 # Required for Qwen2
xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi
uvicorn
pydantic < 2 # Required for OpenAI server.
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette]
pynvml == 11.5.0
triton >= 2.1.0
cupy-cuda12x == 12.3.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.

# vllm: requirements-dev.txt
# formatting
yapf==0.32.0
pylint==2.8.2
toml==0.10.2
ruff==0.1.5

# type checking
mypy==0.991
@@ -24,8 +30,24 @@
types-setuptools
# testing
pytest
pytest-forked
pytest-asyncio
httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
openai
requests
# ray - XXX

# vllm: requirements-build.txt
# Should be mirrored in pyproject.toml
ninja
packaging
setuptools>=49.4.0
# torch==2.1.2 - XXX
wheel

# non-vllm:
ujson
posix_ipc
accelerate
fschat
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -67,6 +67,7 @@
"rust"
],
"cSpell.words": [
"actix",
"aici",
"aicirt",
"avgtol",
12 changes: 12 additions & 0 deletions aicirt/src/api.rs
@@ -100,6 +100,18 @@
impl<T> SequenceResult<T> {
micros: self.micros,
}
}
pub fn map_result<S, F>(self, f: F) -> SequenceResult<S>
where
F: FnOnce(T) -> S,
{
SequenceResult {
error: self.error,
result: self.result.map(f),
storage: self.storage,
logs: self.logs,
micros: self.micros,
}
}
}

#[derive(Serialize, Deserialize)]
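The new map_result converts a SequenceResult<T> into a SequenceResult<S> by applying a closure to the payload while carrying error, storage, logs, and micros across unchanged. A runnable sketch of the same shape, using stand-in field types (the real struct in aicirt/src/api.rs derives Serialize/Deserialize and may type these fields differently):

// Stand-in mirror of SequenceResult; field types here are assumptions for the demo.
struct SequenceResult<T> {
    error: String,
    result: Option<T>,
    storage: Vec<u8>,
    logs: String,
    micros: u64,
}

impl<T> SequenceResult<T> {
    // Same pattern as the diff: transform only the payload, keep all metadata.
    fn map_result<S, F>(self, f: F) -> SequenceResult<S>
    where
        F: FnOnce(T) -> S,
    {
        SequenceResult {
            error: self.error,
            result: self.result.map(f),
            storage: self.storage,
            logs: self.logs,
            micros: self.micros,
        }
    }
}

fn main() {
    let r = SequenceResult {
        error: String::new(),
        result: Some(3u32),
        storage: vec![],
        logs: String::from("ok"),
        micros: 42,
    };
    let s = r.map_result(|n| format!("{n} forks"));
    assert_eq!(s.result.as_deref(), Some("3 forks"));
    assert_eq!(s.micros, 42); // timing metadata survives the conversion
}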
61 changes: 58 additions & 3 deletions aicirt/src/lib.rs
@@ -9,8 +9,11 @@
pub mod shm;
#[cfg(target_os = "macos")]
mod macos;

use std::fmt::Write;

use anyhow::Result;
pub use bench::*;
use flexi_logger::style;
use flexi_logger::{DeferredNow, Logger, WriteMode};
use log::Record;
use thread_priority::{
@@ -27,23 +30,75 @@
pub enum LogMode {
Daemon,
}

struct LimitedWrite {
limit: usize,
dst: String,
}

impl Write for LimitedWrite {
fn write_str(&mut self, s: &str) -> std::fmt::Result {
if self.dst.len() > self.limit {
return Err(std::fmt::Error);
}
if self.dst.len() + s.len() < self.limit {
self.dst.push_str(s);
Ok(())
} else {
let remaining = self.limit - self.dst.len();
self.dst.push_str(&s[..remaining]);
self.dst.push_str(" (...)");
Err(std::fmt::Error)
}
}
}

fn args_to_str(limit: usize, args: &std::fmt::Arguments) -> String {
// let capacity = args.estimated_capacity();
let mut output = LimitedWrite {
limit,
dst: String::with_capacity(128),
};
if output.write_fmt(*args).is_err() {
assert!(output.dst.len() > limit);
}
output.dst
}

fn truncated_format(
w: &mut dyn std::io::Write,
_now: &mut DeferredNow,
record: &Record,
) -> Result<(), std::io::Error> {
let level = record.level();
write!(
w,
"{} [{}] {}",
style(level).paint(level.to_string()),
record.module_path().unwrap_or("<unnamed>"),
style(level).paint(args_to_str(1000, record.args()))
)
}

fn daemon_format(
w: &mut dyn std::io::Write,
now: &mut DeferredNow,
record: &Record,
) -> Result<(), std::io::Error> {
write!(
w,
"[{}] {} {}",
"{} {} [{}] {}",
now.format("%Y-%m-%d %H:%M:%S%.3f"),
record.level(),
&record.args()
record.module_path().unwrap_or("<unnamed>"),
args_to_str(5000, record.args())
)
}

pub fn init_log(mode: LogMode) -> Result<()> {
let logger = match mode {
LogMode::Normal => Logger::try_with_env_or_str("info")?.log_to_stdout(),
LogMode::Normal => Logger::try_with_env_or_str("info")?
.format(truncated_format)
.log_to_stdout(),
LogMode::Test => {
Logger::try_with_env_or_str("debug")?.write_mode(WriteMode::SupportCapture)
}
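The new LimitedWrite adapter caps how much text a formatted log record may emit: once the budget is crossed it appends " (...)" and returns fmt::Error so formatting stops early, and args_to_str collects the possibly truncated output. truncated_format applies a 1000-character budget to stdout logs; daemon_format uses 5000. A self-contained sketch of the mechanism (names mirror the diff; this is a simplified illustration, not the crate's exact code):

use std::fmt::Write;

// Mirrors the LimitedWrite in the diff: stop accepting text once `limit`
// characters have been written, appending a truncation marker.
struct LimitedWrite {
    limit: usize,
    dst: String,
}

impl Write for LimitedWrite {
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        if self.dst.len() > self.limit {
            return Err(std::fmt::Error);
        }
        if self.dst.len() + s.len() < self.limit {
            self.dst.push_str(s);
            Ok(())
        } else {
            // Byte-based slicing: assumes the cut lands on a char boundary (ASCII-safe).
            let remaining = self.limit - self.dst.len();
            self.dst.push_str(&s[..remaining]);
            self.dst.push_str(" (...)");
            Err(std::fmt::Error)
        }
    }
}

fn args_to_str(limit: usize, args: &std::fmt::Arguments) -> String {
    let mut output = LimitedWrite { limit, dst: String::with_capacity(128) };
    // A failed write here just means the output was truncated.
    let _ = output.write_fmt(*args);
    output.dst
}

fn main() {
    // Fits within the budget: passed through unchanged.
    assert_eq!(args_to_str(1000, &format_args!("short message")), "short message");
    // Exceeds a tiny budget: cut at the limit and marked.
    assert_eq!(args_to_str(10, &format_args!("0123456789abcdef")), "0123456789 (...)");
}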
30 changes: 20 additions & 10 deletions aicirt/src/main.rs
@@ -476,8 +476,16 @@
impl ModuleRegistry {
})
.collect::<Vec<_>>();

ensure_user!(wasm_files.len() > 0, "no wasm files found (selector={:?})", selector);
ensure_user!(wasm_files.len() == 1, "too many wasm files found (selector={:?})", selector);
ensure_user!(
wasm_files.len() > 0,
"no wasm files found (selector={:?})",
selector
);
ensure_user!(
wasm_files.len() == 1,
"too many wasm files found (selector={:?})",
selector
);

let wasm_file = wasm_files[0];
let upd = wasm_file["updated_at"]
@@ -700,15 +708,17 @@
impl Stepper {
}
outputs.insert(
id,
data.json.clone_with(Some(AiciPreProcessResultInner {
suspend: data.suspend,
num_forks: data.num_forks,
ff_tokens: data.ff_tokens,
})),
data.map_result(|pp| {
if pp.suspend {
assert!(pp.num_forks == 1);
}
AiciPreProcessResultInner {
suspend: pp.suspend,
num_forks: pp.num_forks,
ff_tokens: pp.ff_tokens,
}
}),
);
if data.suspend {
assert!(data.num_forks == 1);
}
}
Err(e) => self.worker_error(id, &mut outputs, e),
}
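The Stepper hunk replaces clone_with with the new map_result, so the "suspend implies num_forks == 1" invariant is now checked inside the conversion closure rather than after the insert. A reduced sketch of the pattern with stand-in types (the real SequenceResult and AiciPreProcessResultInner live in aicirt and carry more fields):

// Minimal one-field stand-in; see the api.rs hunk above for the full struct.
struct SequenceResult<T> {
    result: Option<T>,
}

impl<T> SequenceResult<T> {
    fn map_result<S, F: FnOnce(T) -> S>(self, f: F) -> SequenceResult<S> {
        SequenceResult { result: self.result.map(f) }
    }
}

#[derive(Debug)]
struct PreProcessOut { suspend: bool, num_forks: usize, ff_tokens: Vec<u32> }
#[derive(Debug)]
struct AiciPreProcessResultInner { suspend: bool, num_forks: usize, ff_tokens: Vec<u32> }

fn main() {
    let data = SequenceResult {
        result: Some(PreProcessOut { suspend: true, num_forks: 1, ff_tokens: vec![] }),
    };
    let out = data.map_result(|pp| {
        // Invariant from the diff: a suspended sequence must not fork.
        if pp.suspend {
            assert!(pp.num_forks == 1);
        }
        AiciPreProcessResultInner {
            suspend: pp.suspend,
            num_forks: pp.num_forks,
            ff_tokens: pp.ff_tokens,
        }
    });
    println!("{:?}", out.result.unwrap());
}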
