diff --git a/.gitignore b/.gitignore
index 3128b16..b9f64bc 100755
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ __pycache__
 core
 *.so
 *.log
+*.zip
+*.tar.gz
+*.deb
diff --git a/README.md b/README.md
index 70f2452..e4a98d5 100755
--- a/README.md
+++ b/README.md
@@ -160,4 +160,6 @@
 echo "setr vpll_clock 100000000"> /sys/kernel/debug/top/clock
 
 Power the device off for a few minutes, then run echo 3 > /proc/sys/vm/drop_caches; dropping the caches brings things back to normal. Some earlier operation may have stomped on that memory.
 
+### Q4: Running python_demo raises "ValueError: vector::_M_default_append"
+A: This is a CMake version problem. Edit CMakeLists.txt and change its first line to cmake_minimum_required(VERSION 3.10).
diff --git a/models/Qwen1_5/compile/export_onnx.py b/models/Qwen1_5/compile/export_onnx.py
index 3cb45c6..05f2e72 100755
--- a/models/Qwen1_5/compile/export_onnx.py
+++ b/models/Qwen1_5/compile/export_onnx.py
@@ -258,6 +258,76 @@ def convert_penalty_sample_head():
         do_constant_folding=True,
         opset_version=15)
 
+def build_prompt(query):
+    return f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n'
+
+def test_net_with_mask():
+    embed = Embedding()
+    blocks = [QwenBlock(i) for i in range(NUM_LAYERS)]
+    block_kvs = [QwenBlockCache(i) for i in range(NUM_LAYERS)]
+    query = """tell me about sophgo in ten words"""
+    print(query)
+    prompt = build_prompt(query)
+    import numpy as np
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    ids = tokenizer.encode(prompt)
+    print("input ids:{}".format(ids))
+    token_len = len(ids)
+    ori_token_len = token_len
+    # right-pad the prompt with zeros up to SEQ_LENGTH
+    ids = ids + (SEQ_LENGTH - token_len) * [0]
+    input_ids = torch.tensor(ids).view(SEQ_LENGTH).to(device)
+    out = embed(input_ids).view(1, SEQ_LENGTH, HIDDEN_SIZE)
+    position_ids = list(range(token_len)) + (SEQ_LENGTH - token_len) * [0]
+    position_ids = torch.tensor([position_ids]).to(device)
+    # causal mask: position i may attend to j only when j <= i; masked
+    # positions get -10000 so they vanish after softmax
+    attention_mask = torch.ones((SEQ_LENGTH, SEQ_LENGTH)).float() * -10000.0
+    for i in range(token_len):
+        for j in range(token_len):
+            if j <= i:
+                attention_mask[i][j] = 0.0
+    attention_mask = attention_mask.view(
+        1, 1, SEQ_LENGTH, SEQ_LENGTH).to(device)
+    k_cache = []
+    v_cache = []
+    # prefill: run the whole padded sequence through each block once and
+    # record the per-layer KV caches
+    for i in range(NUM_LAYERS):
+        out[:, token_len:] = 0
+        out, k, v = blocks[i](out, position_ids, attention_mask)
+        k_cache.append(k)
+        v_cache.append(v)
+    out = out[:, token_len - 1:token_len].view(1, 1, HIDDEN_SIZE)
+    lm = LmHead()
+    greedy_head = GreedyHead()
+    token = greedy_head(lm(out)).view(1)
+    out_ids = [int(token)]
+    word = tokenizer.decode([int(token)])
+    print(word, end="")
+    # decode: feed one token at a time, reusing and updating the KV caches
+    while int(token) != tokenizer.eos_token_id and token_len < ori_token_len + 10:
+        token_len += 1
+        input_ids = torch.tensor([token]).to(device)
+        out = embed(input_ids).view(1, 1, HIDDEN_SIZE)
+        position_ids = torch.tensor([[token_len - 1]]).to(device)
+        attention_mask = torch.zeros((1, 1, 1, SEQ_LENGTH + 1)).float().to(device)
+        attention_mask[:, :, :, token_len:SEQ_LENGTH] = -10000.0
+        for i in range(NUM_LAYERS):
+            out, k, v = block_kvs[i](out, position_ids, attention_mask,
+                                     k_cache[i], v_cache[i])
+            k_cache[i][:, token_len:token_len + 1] = k
+            v_cache[i][:, token_len:token_len + 1] = v
+        token = greedy_head(lm(out)).view(1)
+        out_ids.append(int(token))
+        word = tokenizer.decode([int(token)])
+        print(word, end="")
+    # np.save(f'torch_{token_len}.npy', out)
+    print("\noutput_ids:{}".format(out_ids))
+
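+# Hypothetical helper, added only as an illustration and not called by the
+# export flow: the double loop in test_net_with_mask builds a standard causal
+# mask, which torch.triu can produce in one shot (0.0 on and below the
+# diagonal of the top-left token_len x token_len block, -10000.0 elsewhere).
+def build_causal_mask(token_len, seq_length):
+    mask = torch.full((seq_length, seq_length), -10000.0)
+    mask[:token_len, :token_len] = torch.triu(
+        torch.full((token_len, token_len), -10000.0), diagonal=1)
+    return mask.view(1, 1, seq_length, seq_length)
+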
+# test_net_with_mask()
 
 # create folder to store onnx
 if not os.path.exists(folder):