This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Runtime Enhance] Extend long input token length #157

Merged
merged 22 commits on Mar 13, 2024
6 changes: 3 additions & 3 deletions developer_document.md
@@ -8,7 +8,7 @@ For simplicity, we take [polyglot](https://huggingface.co/EleutherAI/polyglot-ko

First, we need to add its temp buffer in the [related model-arch header file](neural_speed/models/gptneox/gptneox.h) and [re-compile](README.md#Install).
```diff
static const model_scratch gptneox_mem_req(int n_layers) {
static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 44:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
@@ -167,7 +167,7 @@ and update [model_name_to_arch()](neural_speed/models/model_utils/model_types.h#
+ NEW_MODEL_13B,
+};

+static const model_scratch new_model_mem_req(int n_layers) {
+static const model_scratch new_model_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
+ switch (n_layers) {
+ case N:
+ return {8192ull * MB, 8192ull * MB, 8192ull * MB};
@@ -390,7 +390,7 @@ We recommend to use continuous batching way since it has no padding effect and c
+ ne_view_2d(ctx0, KQV_merged_contiguous, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_merged_contiguous), ne_element_size(KQV_merged_contiguous) * off_sl)));
+ off_sl += head_size * n_head * attn_sl * attn_bs;
```
>Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`model_scratch_enlarge_scale`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
>Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`scratch_size_ratio`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
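For reference, a minimal runtime sketch of these settings through the Python API this PR extends (the checkpoint name and values below are illustrative assumptions; `ctx_size` and `scratch_size_ratio` are forwarded to `init_model()` via `**generate_kwargs`):

```python
from neural_speed import Model

model = Model()
model.init("meta-llama/Llama-2-7b-chat-hf")  # hypothetical checkpoint; converted and quantized on first use

# If scratch_size_ratio is omitted, init_from_bin() picks one from the ctx_size heuristic added in this PR.
model.init_from_bin(model.model_type, model.bin_file, ctx_size=4096, scratch_size_ratio=2)
```

Changing `NE_MAX_NODES`, by contrast, is a compile-time adjustment and still requires re-compiling.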

## 2.3. Application
- Q4_0 quant: We can quantize the model produced by `convert` into an int4 low-bit file by adding a quant layer class, which yields better inference performance. Register the quant layer class in your new_model_utils.cpp, as in [gptneox_utils.cpp](neural_speed/models/gptneox/gptneox_utils.cpp#L163), replacing `gptneox_quant_layer` with your `new_model_quant_layer`.
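As a companion to the C++ registration step above, here is a hedged sketch of driving that quantization from Python; the keyword arguments mirror `Model.init()` in `neural_speed/__init__.py`, and the checkpoint name is only an example:

```python
from neural_speed import Model

model = Model()
# use_quant=True first converts the checkpoint to an FP32 binary, then quantizes it into
# an int4 low-bit file using the quant layer class registered for this model architecture.
model.init("EleutherAI/polyglot-ko-5.8b",
           use_quant=True,
           weight_dtype="int4",
           alg="sym",
           group_size=32,
           scale_dtype="fp32",
           compute_dtype="int8")
print(model.bin_file)  # path to the quantized model used for inference
```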
2 changes: 1 addition & 1 deletion docs/gptq_and_awq.md
@@ -13,7 +13,7 @@ Validated GPTQ & AWQ models directly from the HuggingFace:
* [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) & [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)
* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)

Please check more validated GPTQ & AWQ models in the list of [supported_models](./docs/supported_models.md).
Please check more validated GPTQ & AWQ models in the list of [supported_models](./supported_models.md).

## Examples

19 changes: 19 additions & 0 deletions docs/supported_models.md
@@ -10,6 +10,7 @@ Neural Speed supports the following models:
<th colspan="4">INT8</th>
<th colspan="4">INT4</th>
<th rowspan="2">Transformer Version</th>
<th rowspan="2">Max tokens length</th>
</tr>
<tr>
<th>RTN</th>
@@ -36,6 +37,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/decapoda-research/llama-7b-hf" target="_blank" rel="noopener noreferrer">LLaMA-7B</a>,
@@ -49,6 +51,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>2048</td>
</tr>
<td><a href="https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf" target="_blank" rel="noopener noreferrer">CodeLlama-7b</a></td>
<td>✅</td>
@@ -60,6 +63,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>16384</td>
</tr>
</tr>
<td><a href="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0" target="_blank" rel="noopener noreferrer">Solar-10.7B</a></td>
@@ -72,6 +76,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/Intel/neural-chat-7b-v3-1" target="_blank" rel="noopener noreferrer">Neural-Chat-7B-v3-1</a>,
@@ -85,6 +90,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
@@ -98,6 +104,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>4.36.0 or newer</td>
<td>32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
@@ -113,6 +120,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>8192 / 32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-j-6b" target="_blank" rel="noopener noreferrer">GPT-J-6B</a></td>
@@ -125,6 +133,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-neox-20b" target="_blank" rel="noopener noreferrer">GPT-NeoX-20B</a></td>
@@ -137,6 +146,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/databricks/dolly-v2-3b" target="_blank" rel="noopener noreferrer">Dolly-v2-3B</a></td>
@@ -149,6 +159,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.28.1 or newer</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/mosaicml/mpt-7b" target="_blank" rel="noopener noreferrer">MPT-7B</a>,
@@ -162,6 +173,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/tiiuae/falcon-7b" target="_blank" rel="noopener noreferrer">Falcon-7B</a>,
@@ -175,6 +187,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/bigscience/bloomz-7b1" target="_blank" rel="noopener noreferrer">BLOOM-7B</a></td>
@@ -187,6 +200,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/facebook/opt-125m" target="_blank" rel="noopener noreferrer">OPT-125m</a>,
@@ -201,6 +215,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/THUDM/chatglm-6b" target="_blank" rel="noopener noreferrer">ChatGLM-6B</a>,
@@ -214,6 +229,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.33.1</td>
<td>2048 / 32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/baichuan-inc/Baichuan-13B-Chat" target="_blank" rel="noopener noreferrer">Baichuan-13B-Chat</a>,
@@ -227,6 +243,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.33.1</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/microsoft/phi-2" target="_blank" rel="noopener noreferrer">phi-2</a>,
@@ -241,6 +258,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/openai/whisper-tiny" target="_blank" rel="noopener noreferrer">Whisper-tiny</a>,
@@ -257,6 +275,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>448</td>
</tr>
</tbody>
</table>
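
The new "Max token length" column comes from each checkpoint's Hugging Face config. Below is a minimal sketch of that lookup, mirroring the `get_max_seq_length()` helper this PR adds to `neural_speed/__init__.py` (the field priority and the 512 fallback follow that helper; Falcon configs carry none of these fields and are special-cased to 2048 there):

```python
from transformers import AutoConfig

def max_token_length(model_name: str) -> int:
    """Best-effort lookup of a model's maximum supported sequence length."""
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True).to_dict()
    for key in ("seq_length",               # chatglm2, bloom
                "max_position_embeddings",  # llama, qwen, opt, phi, gptneox, dolly
                "model_max_length",         # baichuan, baichuan2
                "n_positions",              # gptj
                "max_seq_len",              # mpt
                "max_sequence_length",      # chatglm
                "max_length"):              # whisper
        if key in config:
            return config[key]
    return 512  # fallback used by this PR when no known field is present
```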
136 changes: 116 additions & 20 deletions neural_speed/__init__.py
@@ -24,6 +24,7 @@


class Model:

def __init__(self):
self.module = None
self.model = None
@@ -84,9 +85,19 @@ def get_model_type(model_config):
model_type = "chatglm2"
return model_type

def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
def init(self,
model_name,
use_quant=True,
use_gptq=False,
use_awq=False,
use_autoround=False,
weight_dtype="int4",
alg="sym",
group_size=32,
scale_dtype="fp32",
compute_dtype="int8",
use_ggml=False,
model_hub="huggingface"):
if model_hub == "modelscope":
from modelscope import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
@@ -124,24 +135,28 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
self.bin_file = quant_bin

if os.path.exists(self.bin_file):
print("{} existed, will use cache file. Otherwise please remove the file".
format(self.bin_file))
print("{} existed, will use cache file. Otherwise please remove the file".format(self.bin_file))
return

if use_gptq or use_awq or use_autoround:
convert_model(model_name, quant_bin, use_quantized_model=True)
return

if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32", model_hub = model_hub)
convert_model(model_name, fp32_bin, "f32", model_hub=model_hub)
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"

if not use_quant:
print("FP32 model will be used.")
return
self.module.Model.quant_model(model_path=fp32_bin, out_path=quant_bin,
weight_dtype=weight_dtype, alg=alg, group_size=group_size,
scale_dtype=scale_dtype, compute_dtype=compute_dtype, use_ggml=use_ggml)
self.module.Model.quant_model(model_path=fp32_bin,
out_path=quant_bin,
weight_dtype=weight_dtype,
alg=alg,
group_size=group_size,
scale_dtype=scale_dtype,
compute_dtype=compute_dtype,
use_ggml=use_ggml)
assert os.path.exists(quant_bin), "Fail to quantize model"

# clean
@@ -150,9 +165,11 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.__import_package(model_type)
self.model = self.module.Model()

if self.max_request_num == -1:
self.max_request_num = max(generate_kwargs.get("max_request_num",
max_request_num_default), generate_kwargs.get("batch_size", 1))
self.max_request_num = max(generate_kwargs.get("max_request_num", max_request_num_default),
generate_kwargs.get("batch_size", 1))

if "threads" not in generate_kwargs:
threads = os.getenv("OMP_NUM_THREADS")
import platform
@@ -165,29 +182,107 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
generate_kwargs["threads"] = len(os.sched_getaffinity(0))
else:
generate_kwargs["threads"] = int(threads)
self.model.init_model(model_path, **generate_kwargs)

# Setting scratch_size_ratio according to the ctx_size & tokens_length
# If scratch_size_ratio has been set, will not enter this branch.
if generate_kwargs.get("ctx_size") is not None and generate_kwargs.get(
"ctx_size") > 2048 and generate_kwargs.get("scratch_size_ratio") is None:

def get_max_seq_length():
config = self.config.to_dict()
# chatglm2, bloom
if 'seq_length' in config:
return config['seq_length']
# qwen2, llama-2, llama, dolly, gptneox, qwen, qwen1.5, opt, phi
elif 'max_position_embeddings' in config:
return config['max_position_embeddings']
# baichuan, baichuan2
elif 'model_max_length' in config:
return config['model_max_length']
# gptj
elif 'n_positions' in config:
return config['n_positions']
# mpt
elif 'max_seq_len' in config:
return config['max_seq_len']
# chatglm
elif 'max_sequence_length' in config:
return config['max_sequence_length']
# whisper
elif 'max_length' in config:
return config['max_length']
# Falcon does not have these parameters.
elif model_type == "falcon":
return 2048
else:
print("Not found max seq length, setting to default 512")
return 512

# when tokens less than 10240
def get_scratch_size_ratio(size):
if size > 2048 and size <= 4096:
return 2
elif size > 4096 and size <= 8192:
return 4
elif size > 8192 and size <= 10240:
return 8
else:
# more than 10240
return -1

max_seq_length = get_max_seq_length()
ctx_size = generate_kwargs.get("ctx_size")

if ctx_size > max_seq_length:
print(f'max_seq_length is {max_seq_length}, but ctx_size is {ctx_size}. Please reduce ctx_size.')
exit(0)
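# Summary of the scratch_size_ratio heuristic below (added in this PR): the ratio is
# chosen from the model's max sequence length and then refined by the requested ctx_size.
#   max_seq_length in (2048, 4096]                -> 2
#   max_seq_length in (4096, 8192]                -> 4
#   max_seq_length > 8192 and ctx_size <= 10240   -> 2 / 4 / 8 by ctx_size tier
#   max_seq_length == 16384 and ctx_size > 10240  -> 12
#   max_seq_length == 32768 and ctx_size > 10240  -> 20 (ctx_size < 20480) or 35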

if max_seq_length > 2048 and max_seq_length <= 4096:
generate_kwargs["scratch_size_ratio"] = 2
elif max_seq_length > 4096 and max_seq_length <= 8192:
generate_kwargs["scratch_size_ratio"] = 4
elif max_seq_length > 8192:
if get_scratch_size_ratio(ctx_size) != -1:
generate_kwargs["scratch_size_ratio"] = get_scratch_size_ratio(ctx_size)
else:
if max_seq_length == 16384:
generate_kwargs["scratch_size_ratio"] = 12
elif max_seq_length == 32768:
if ctx_size < 20480:
generate_kwargs["scratch_size_ratio"] = 20
else:
generate_kwargs["scratch_size_ratio"] = 35

self.model.init_model(model_path, **generate_kwargs)

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
self.__import_package(model_type)
self.module.Model.quant_model(model_path=model_path, out_path=out_path, **quant_kwargs)

def generate(self,
input_ids,
streamer=None,
interactive=False,
ignore_prompt=False,
stopping_criteria=None,
**generate_kwargs):
batch_size = input_ids.shape[0]

def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
stopping_criteria=None, **generate_kwargs):
max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
input_bs = input_ids.shape[0]
max_request_num = generate_kwargs.pop("max_request_num", max_request_num_default)
reinit_from_bin = False
if max_request_num > self.max_request_num or input_bs > self.max_request_num:
if max_request_num > self.max_request_num or batch_size > self.max_request_num:
reinit_from_bin = True
if self.max_request_num > 0:
print("Will start to reinit model from bin due to different max request num.")
self.max_request_num = max(input_bs, max_request_num)
self.max_request_num = max(batch_size, max_request_num)

if self.model is None or reinit_from_bin:
self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs,
max_request_num = self.max_request_num, **generate_kwargs)
self.init_from_bin(self.model_type,
self.bin_file,
batch_size=batch_size,
max_request_num=self.max_request_num,
**generate_kwargs)
self.generate_round = 0
elif not interactive:
self.model.reinit()
@@ -208,6 +303,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
assert input_ids.shape[0] == 1, "Streamer only supports batch size 1."
assert beam_search == False, "ERROR, can not use streamer when use beam search for generation! \
Make sure that `num_beams` is set to 1."

if self.generate_round == 0 and not ignore_prompt:
streamer.put(input_ids)

@@ -284,6 +380,6 @@ def _cont_batching_input(self, input_ids, pad_token_id=None):
for il in range(len(input_list)):
count = input_list[il].count(pti)
# padding left
del input_list[il][0: count]
del input_list[il][0:count]
assert input_list[il] != [], "there are all pad tokens in batch {}.".format(il)
return input_list
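
Taken together, a hedged end-to-end usage sketch of the Python API this PR touches (the checkpoint and generation settings are illustrative assumptions):

```python
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "Intel/neural-chat-7b-v3-1"  # example checkpoint from the supported-models table
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time, there existed a little girl,", return_tensors="pt").input_ids

model = Model()
model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")

# A ctx_size above 2048 triggers the scratch_size_ratio heuristic above unless a
# ratio is passed explicitly alongside it.
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, ctx_size=4096)
```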