diff --git a/.gitignore b/.gitignore
index 3128b16..b9f64bc 100755
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ __pycache__
 core
 *.so
 *.log
+*.zip
+*.tar.gz
+*.deb
diff --git a/README.md b/README.md
index 70f2452..e4a98d5 100755
--- a/README.md
+++ b/README.md
@@ -160,4 +160,6 @@
 echo "setr vpll_clock 100000000"> /sys/kernel/debug/top/clock
 
 Power the device off for a few minutes, then run echo 3 > /proc/sys/vm/drop_caches; dropping the caches brings things back to normal. Some earlier operation may have stomped on that memory.
 
+### Q4: Running python_demo raises "ValueError: vector::_M_default_append"
+A: This is a CMake version problem. Edit CMakeLists.txt and change its first line to cmake_minimum_required(VERSION 3.10).
diff --git a/models/Qwen1_5/compile/export_onnx.py b/models/Qwen1_5/compile/export_onnx.py
index 3cb45c6..05f2e72 100755
--- a/models/Qwen1_5/compile/export_onnx.py
+++ b/models/Qwen1_5/compile/export_onnx.py
@@ -258,6 +258,76 @@ def convert_penalty_sample_head():
         do_constant_folding=True,
         opset_version=15)
 
+def build_prompt(query):
+    return f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n'
+
+def test_net_with_mask():
+    embed = Embedding()
+    blocks = [QwenBlock(i) for i in range(NUM_LAYERS)]
+    block_kvs = [QwenBlockCache(i) for i in range(NUM_LAYERS)]
+    query = """tell me about sophgo in ten words"""
+    print(query)
+    prompt = build_prompt(query)
+    import numpy as np
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    ids = tokenizer.encode(prompt)
+    print("input ids:{}".format(ids))
+    token_len = len(ids)
+    ori_token_len = token_len
+    # right-pad the prompt with zeros up to SEQ_LENGTH
+    ids = ids + (SEQ_LENGTH - token_len) * [0]
+    input_ids = torch.tensor(ids).view(SEQ_LENGTH).to(device)
+    out = embed(input_ids).view(1, SEQ_LENGTH, HIDDEN_SIZE)
+    position_ids = list(range(token_len)) + (SEQ_LENGTH - token_len) * [0]
+    position_ids = torch.tensor([position_ids]).to(device)
+    # causal mask: position i may attend to j only when j <= i; masked
+    # positions get -10000 so they vanish after softmax
+    attention_mask = torch.ones((SEQ_LENGTH, SEQ_LENGTH)).float() * -10000.0
+    for i in range(token_len):
+        for j in range(token_len):
+            if j <= i:
+                attention_mask[i][j] = 0.0
+    attention_mask = attention_mask.view(
+        1, 1, SEQ_LENGTH, SEQ_LENGTH).to(device)
+    k_cache = []
+    v_cache = []
+    # prefill: run the whole padded sequence through each block once and
+    # record the per-layer KV caches
+    for i in range(NUM_LAYERS):
+        out[:, token_len:] = 0
+        out, k, v = blocks[i](out, position_ids, attention_mask)
+        k_cache.append(k)
+        v_cache.append(v)
+    out = out[:, token_len - 1:token_len].view(1, 1, HIDDEN_SIZE)
+    lm = LmHead()
+    greedy_head = GreedyHead()
+    token = greedy_head(lm(out)).view(1)
+    out_ids = [int(token)]
+    word = tokenizer.decode([int(token)])
+    print(word, end="")
+    # decode: feed one token at a time, reusing and updating the KV caches
+    while int(token) != tokenizer.eos_token_id and token_len < ori_token_len + 10:
+        token_len += 1
+        input_ids = torch.tensor([token]).to(device)
+        out = embed(input_ids).view(1, 1, HIDDEN_SIZE)
+        position_ids = torch.tensor([[token_len - 1]]).to(device)
+        attention_mask = torch.zeros((1, 1, 1, SEQ_LENGTH + 1)).float().to(device)
+        attention_mask[:, :, :, token_len:SEQ_LENGTH] = -10000.0
+        for i in range(NUM_LAYERS):
+            out, k, v = block_kvs[i](out, position_ids, attention_mask,
+                                     k_cache[i], v_cache[i])
+            k_cache[i][:, token_len:token_len + 1] = k
+            v_cache[i][:, token_len:token_len + 1] = v
+        token = greedy_head(lm(out)).view(1)
+        out_ids.append(int(token))
+        word = tokenizer.decode([int(token)])
+        print(word, end="")
+    # np.save(f'torch_{token_len}.npy', out)
+    print("\noutput_ids:{}".format(out_ids))
+
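+# Hypothetical helper, added only as an illustration and not called by the
+# export flow: the double loop in test_net_with_mask builds a standard causal
+# mask, which torch.triu can produce in one shot (0.0 on and below the
+# diagonal of the top-left token_len x token_len block, -10000.0 elsewhere).
+def build_causal_mask(token_len, seq_length):
+    mask = torch.full((seq_length, seq_length), -10000.0)
+    mask[:token_len, :token_len] = torch.triu(
+        torch.full((token_len, token_len), -10000.0), diagonal=1)
+    return mask.view(1, 1, seq_length, seq_length)
+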
+# test_net_with_mask()
 
 # create folder to store onnx
 if not os.path.exists(folder):