This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit
Merge branch 'main' into qwen1-5
intellinjun authored Mar 4, 2024
2 parents 95d0b60 + 7c2199f commit 7b83f79
Showing 47 changed files with 3,404 additions and 405 deletions.
230 changes: 162 additions & 68 deletions developer_document.md

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions docs/supported_models.md
@@ -171,14 +171,15 @@ Neural Speed supports the following models:
<td>4.33.1</td>
</tr>
<tr>
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a></td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
<a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
<td>✅</td>
<td> </td>
<td> </td>
<td>✅</td>
<td>4.34.0 or newer</td>
<td> </td>
<td> </td>
<td>4.36.0 or newer</td>
</tr>
<tr>
<td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
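The new table row raises the minimum transformers requirement from 4.34.0 to 4.36.0, the release that first recognizes Mixtral checkpoints. A small hedged pre-flight check one could run before converting Mixtral-8x7B (assumes the packaging and transformers packages are installed; not part of this commit):

```python
from packaging import version
import transformers

# Per the updated table, Mixtral-8x7B needs transformers 4.36.0 or newer.
required = version.parse("4.36.0")
installed = version.parse(transformers.__version__)
if installed < required:
    raise RuntimeError(f"transformers {installed} is too old for Mixtral-8x7B; "
                       f"upgrade to {required} or newer")
```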
4 changes: 3 additions & 1 deletion neural_speed/__init__.py
@@ -72,6 +72,8 @@ def __import_package(self, model_type):
import neural_speed.phi_cpp as cpp_model
elif model_type == "whisper":
import neural_speed.whisper_cpp as cpp_model
elif model_type == "mixtral":
import neural_speed.mixtral_cpp as cpp_model
else:
raise TypeError("Unsupported model type {}!".format(model_type))
self.module = cpp_model
@@ -210,7 +212,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
out_count = 0
input_list = None
pad_token_id = generate_kwargs.get("pad_token", None)
if generate_kwargs.get("continuous_batching", False):
if input_ids.shape[0] > 1 and generate_kwargs.get("continuous_batching", True):
input_list = self._cont_batching_input(input_ids, pad_token_id)
else:
input_list = input_ids.tolist()
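The dispatch hunk wires the new mixtral_cpp binding into __import_package, and the generate hunk turns continuous batching on by default while restricting it to genuine multi-prompt inputs. The repository's _cont_batching_input helper is not shown in this diff; the sketch below is a hypothetical stand-in that only illustrates the padding a batched first pass needs (pad_token_id and the prompts are assumed caller inputs, and the real helper may pad differently):

```python
from typing import List, Optional

def pad_prompts(input_ids: List[List[int]], pad_token_id: Optional[int]) -> List[List[int]]:
    """Illustrative only: bring a batch of prompts to a common length.

    This sketch pads on the left so every prompt ends at the same position;
    neural_speed's actual _cont_batching_input is not reproduced here.
    """
    if pad_token_id is None:
        raise ValueError("multi-prompt generation needs a pad token id")
    max_len = max(len(ids) for ids in input_ids)
    return [[pad_token_id] * (max_len - len(ids)) + ids for ids in input_ids]

# Two prompts of unequal length, hypothetical pad id 0:
print(pad_prompts([[1, 2, 3], [4, 5]], pad_token_id=0))  # [[1, 2, 3], [0, 4, 5]]
```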
3 changes: 3 additions & 0 deletions neural_speed/application/CMakeLists.txt
@@ -67,6 +67,7 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)
compile_quant(quant_baichuan quant_model.cpp baichuan baichuan)
compile_quant(quant_mistral quant_model.cpp mistral llama)
compile_quant(quant_mixtral quant_model.cpp mixtral llama)
compile_quant(quant_qwen quant_model.cpp qwen qwen)
compile_quant(quant_phi quant_model.cpp phi phi)
compile_quant(quant_whisper quant_whisper.cpp whisper whisper)
@@ -93,6 +94,7 @@ set(mymap_mistral 14)
set(mymap_qwen 15)
set(mymap_phi 16)
set(mymap_whisper 17)
set(mymap_mixtral 18)



@@ -129,6 +131,7 @@ compile_run(run_baichuan main_run.cpp main_pybind.cpp baichuan baichuan)
compile_run(run_mistral main_run.cpp main_pybind.cpp mistral llama)
compile_run(run_qwen main_run.cpp main_pybind.cpp qwen qwen)
compile_run(run_phi main_run.cpp main_pybind.cpp phi phi)
compile_run(run_mixtral main_run.cpp main_pybind.cpp mixtral llama)

# speech recognition
compile_run(run_whisper audio_run.cpp whisper_pybind.cpp whisper whisper)
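Mixtral reuses the llama graph here (the last argument of compile_quant/compile_run), and mymap_mixtral feeds the MODEL_NAME_ID definition that main_pybind.cpp switches on below, so the value 18 must match the new #elif branch. A hedged way to confirm the resulting extension modules actually built, assuming the "<arch>_cpp" naming imported by neural_speed.__init__:

```python
import importlib

# Module names follow the "<arch>_cpp" pattern used by __import_package;
# mixtral_cpp is the target this commit adds.
for name in ("mistral_cpp", "mixtral_cpp"):
    try:
        importlib.import_module(f"neural_speed.{name}")
        print(f"neural_speed.{name}: available")
    except ImportError as err:  # not built, or neural_speed not installed
        print(f"neural_speed.{name}: not built ({err})")
```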
44 changes: 27 additions & 17 deletions neural_speed/application/main_pybind.cpp
@@ -76,13 +76,14 @@ using Response = Query;
using ResponseCallback = std::function<void(std::vector<Response>, int)>;
} // namespace

static std::set<model_archs> cont_batching_model_archs = {MODEL_GPTJ, MODEL_LLAMA};
void init_gpt_params(gpt_params* params, const std::string& model_path, int max_new_tokens = -1, int n_batch = 512,
int ctx_size = 512, int seed = -1, int threads = 8, float repetition_penalty = 1.1f,
int num_beams = 1, bool do_sample = false, int top_k = 40, float top_p = 0.95,
float temperature = 0.8, int min_new_tokens = 0, float length_penalty = 1.0f,
bool early_stopping = false, int n_keep = 0, int n_discard = -1, bool shift_roped_k = false,
int batch_size = 1, model_vocab::id pad_token = -1, const std::string& memory_dtype = "auto",
const bool& continuous_batching = false, const int& max_request_num = MODEL_MAX_REQUEST_NUM,
bool continuous_batching = true, const int& max_request_num = MODEL_MAX_REQUEST_NUM,
const float& model_scratch_enlarge_scale = 1.0f) {
MODEL_ASSERT(params != nullptr);
#ifdef MODEL_NAME
@@ -114,10 +115,13 @@ void init_gpt_params(gpt_params* params, const std::string& model_path, int max_
params->memory_type = KV_MEM_TYPE_AUTO;
else
fprintf(stderr, "Unexpected memory dtype %s!", memory_dtype.c_str());
if (batch_size > 1 && (!continuous_batching || params->model_arch != model_archs::MODEL_GPTJ)) {
params->memory_type = KV_MEM_TYPE_F16; // TODO(Yi & YZT): MHA IN MULTI-BATCH For More Model Archs
}
// TODO(Yi & YZT): MHA IN MULTI-BATCH For More Model Archs
params->cont_batching = continuous_batching;
if (params->shift_roped_k) params->cont_batching = false;
if (cont_batching_model_archs.count(params->model_arch) == 0) params->cont_batching = false;
if (batch_size > 1 && !continuous_batching) {
params->memory_type = KV_MEM_TYPE_F16;
}
params->max_request_num = std::max(batch_size, max_request_num);
params->min_new_tokens = min_new_tokens;
params->length_penalty = length_penalty;
@@ -137,8 +141,8 @@ class ModelServer {
int n_batch, int ctx_size, int seed, int threads, float repetition_penalty, int num_beams, bool do_sample,
int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping,
int n_keep, int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num,
const float& model_scratch_enlarge_scale, const std::string& policy, const bool& print_log,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
const float& model_scratch_enlarge_scale, const std::string& policy, bool print_log,
const std::function<void()>& init_cb)
: response(response),
waiting(),
@@ -258,12 +262,16 @@ class ModelServer {
int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p,
float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep,
int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
const float& model_scratch_enlarge_scale) {
init_gpt_params(&params, model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty,
num_beams, do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty, early_stopping,
n_keep, n_discard, shift_roped_k, batch_size, pad_token, memory_dtype, continuous_batching,
max_request_num, model_scratch_enlarge_scale);
if (cont_batching_model_archs.count(params.model_arch) == 0) {
fprintf(stderr, "\nERROR: ModelServer only supports gpt-j, llama!\n");
running = false;
}
}

~ModelServer() {
@@ -317,8 +325,7 @@ class Model {
float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature,
int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, int n_discard,
bool shift_roped_k, int batch_size, model_vocab::id pad_token, const std::string& memory_dtype,
const bool& continuous_batching, const int& max_request_num,
const float& model_scratch_enlarge_scale);
bool continuous_batching, const int& max_request_num, const float& model_scratch_enlarge_scale);
void reinit();
std::vector<std::vector<model_token>> generate(const std::vector<std::vector<model_token>>& input_ids);
// deprecated API
@@ -411,7 +418,7 @@ void Model::init_model(const std::string& model_path, int max_new_tokens, int n_
int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p,
float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep,
int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
const float& model_scratch_enlarge_scale) {
init_gpt_params(&params, model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty, num_beams,
do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty, early_stopping, n_keep,
@@ -466,9 +473,9 @@ bool Model::check_input_and_count_padding(const std::vector<std::vector<model_to
} else { // multi-batch inputs (first token)
ctx->batch_size = input_ids.size();
MODEL_ASSERT(input_ids.size() <= ctx->max_request_num);
static std::set<model_archs> batched_model_archs = {MODEL_GPTJ, MODEL_GPTNEOX, MODEL_CHATGLM};
static std::set<model_archs> batched_model_archs = {MODEL_GPTJ, MODEL_GPTNEOX, MODEL_CHATGLM, MODEL_LLAMA};
if (batched_model_archs.count(params.model_arch) == 0) {
fprintf(stderr, "\nERROR: Only gpt-j, gpt-neox, chatglm support multi-batch generation!\n");
fprintf(stderr, "\nERROR: Only gpt-j, gpt-neox, chatglm, llama support multi-batch generation!\n");
return false;
}
if (ctx->vocab.pad_token_id == -1) {
@@ -738,7 +745,7 @@ std::vector<std::vector<model_token>> Model::post_beam_search(model_context* lct
const std::vector<model_input>& inputs,
const int& n_threads) {
// TODO(Zhentao): to implement
static std::set<model_archs> supported_archs = {MODEL_GPTJ, MODEL_GPTNEOX};
static std::set<model_archs> supported_archs = {MODEL_GPTJ, MODEL_GPTNEOX, MODEL_LLAMA};
if (supported_archs.count(params.model_arch) != 0) {
return beam_search(lctx, n_predict, inputs, n_threads);
} else {
@@ -898,6 +905,10 @@ PYBIND11_MODULE(phi_cpp, m)

PYBIND11_MODULE(whisper_cpp, m)

#elif MODEL_NAME_ID == 18

PYBIND11_MODULE(mixtral_cpp, m)

#endif
{
m.doc() = "cpp model python binding";
@@ -910,7 +921,7 @@ PYBIND11_MODULE(whisper_cpp, m)
py::arg("min_new_tokens") = 0, py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false,
py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false,
py::arg("batch_size") = 1, py::arg("pad_token") = -1, py::arg("memory_dtype") = "auto",
py::arg("continuous_batching") = false, py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM,
py::arg("continuous_batching") = true, py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM,
py::arg("model_scratch_enlarge_scale") = 1.0f)
.def("generate", &Model::generate, "Generate token with input ids", py::arg("input_ids"))
.def("evaluate", &Model::evaluate, "Evaluate token with input ids and output logits",
@@ -942,9 +953,8 @@ PYBIND11_MODULE(whisper_cpp, m)
.def_readwrite("token_ids", &Query::token_ids);
py::class_<ModelServer>(m, "ModelServer", py::module_local())
.def(py::init<const ResponseCallback&, const std::string&, bool, int, int, int, int, int, float, int, bool, int,
float, float, int, float, bool, int, int, bool, int, model_vocab::id, const std::string&,
const bool&, const int&, const float&, const std::string&, const bool&,
const std::function<void()>&>(),
float, float, int, float, bool, int, int, bool, int, model_vocab::id, const std::string&, bool,
const int&, const float&, const std::string&, bool, const std::function<void()>&>(),
py::arg("response"), py::arg("model_path"), py::arg("return_prompt") = false, py::arg("max_new_tokens") = -1,
py::arg("n_batch") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, py::arg("threads") = 8,
py::arg("repetition_penalty") = 1.1f, py::arg("num_beams") = 1, py::arg("do_sample") = false,
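Taken together, these hunks default continuous_batching to true, limit it to the GPT-J and LLaMA graphs, and extend multi-batch generation and beam search to llama, the graph Mixtral also reuses. A usage sketch from Python: the Model constructor, the init_model method name, and its keyword names are assumed from the pybind argument lists above, and the model path and token ids are placeholders only.

```python
import neural_speed.mixtral_cpp as mixtral_cpp  # built via compile_run(run_mixtral ...)

model = mixtral_cpp.Model()
# continuous_batching now defaults to True, so only pad_token needs to be
# set explicitly before feeding more than one prompt at once.
model.init_model("mixtral-8x7b-q4.bin",  # placeholder path
                 max_new_tokens=32, batch_size=2, pad_token=0,  # placeholder pad id
                 memory_dtype="auto")

prompts = [[1, 15043, 3186], [1, 6028]]  # placeholder token ids
for seq in model.generate(prompts):
    print(seq)
```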
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_baichuan.py
@@ -156,6 +156,8 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_bloom.py
@@ -99,6 +99,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", 0)) # n_experts
fout.write(struct.pack("i", 0)) # n_expert_used
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
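Both converter hunks add the same two int32 placeholders so dense checkpoints stay compatible with the header layout a mixture-of-experts model needs. A minimal sketch of how an MoE converter could fill those slots, assuming the Hugging Face Mixtral config keys num_local_experts and num_experts_per_tok (neither key appears in this diff):

```python
import struct
from typing import BinaryIO

def write_expert_fields(fout: BinaryIO, hparams: dict) -> None:
    # Dense models carry neither key, so both fields fall back to 0, matching
    # the zero padding written above by convert_baichuan.py and convert_bloom.py.
    fout.write(struct.pack("i", hparams.get("num_local_experts", 0)))    # n_experts
    fout.write(struct.pack("i", hparams.get("num_experts_per_tok", 0)))  # n_expert_used
```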