diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
diff --git a/.gitmodules b/.gitmodules
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
diff --git a/models/Baichuan2/README.md b/models/Baichuan2/README.md
new file mode 100644
index 0000000..635ccb5
--- /dev/null
+++ b/models/Baichuan2/README.md
@@ -0,0 +1,182 @@
+![image](./assets/sophgo_chip.png)
+
+# Baichuan2-TPU
+
+本项目实现BM1684X部署语言大模型[Baichuan2-7B](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)。通过[TPU-MLIR](https://github.com/sophgo/tpu-mlir)编译器将模型转换成bmodel，并采用c++代码将其部署到BM1684X的PCIE环境，或者SoC环境。
+
+下文中默认是PCIE环境；如果是SoC环境，按提示操作即可。
+
+# 目录说明
+```
+.
+├── README.md                           #使用说明
+├── requirements.txt                    #需要使用的python wheel包
+├── assets
+├── compile
+│   ├── compile.sh                      #用来编译TPU模型的脚本
+│   ├── export_onnx_fast.py             #用来导出onnx的脚本
+│   ├── modeling_baichuan.py            #替换Baichuan2-7B-chat的对应文件的备份
+│   └── torch_inference.py              #torch推理脚本
+├── demo                                #Baichuan2 c++代码文件
+│   ├── CMakeLists.txt
+│   └── demo.cpp                        #主程序
+├── src                                 #编译依赖库
+│   ├── include
+│   ├── lib_pcie
+│   └── lib_soc
+├── model                               #模型文件（bmodel需下载）
+│   ├── baichuan2-7b-test_int8.bmodel
+│   └── tokenizer.model
+└── web_demo                            #web demo，提供网页对话示例
+    ├── chat.cpp
+    ├── chat.py
+    ├── CMakeLists.txt
+    └── web_demo.py
+```
+----------------------------
+
+# 【阶段一】模型编译
+
+## 注意点
+* 模型编译必须要在docker内完成，无法在docker外操作
+
+### 步骤一：模型下载
+Baichuan2模型在hugging face上完全开源，供用户下载使用。请根据官网下载步骤进行模型与权重的下载。
+```bash
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
+
+# if you want to clone without large files – just their pointers
+# prepend your git clone with the following env var:
+GIT_LFS_SKIP_SMUDGE=1
+```
+
+### 步骤二：下载docker
+
+下载docker，启动容器，如下：
+
+``` shell
+docker pull sophgo/tpuc_dev:latest
+
+# myname1234 is just an example, you can set your own name
+docker run --privileged --name myname1234 -v $PWD:/workspace -it sophgo/tpuc_dev:latest
+```
+
+### 步骤三：下载TPU-MLIR代码并编译
+
+``` shell
+git clone git@github.com:sophgo/tpu-mlir.git
+cd tpu-mlir
+source ./envsetup.sh
+./build.sh
+```
+* PS：重新进入docker环境并且需要编译模型时，必须在此路径下执行上述`source ./envsetup.sh` 和 `./build.sh`才能完成后续模型编译。
+
+### 步骤四：下载本项目，安装requirements.txt
+下载transformers、sentencepiece、Baichuan2-TPU以及百度网盘里的.bin模型，并替换transformers里面的modeling_baichuan.py
+
+``` shell
+git clone https://github.com/sophgo/Baichuan2-TPU.git
+cd Baichuan2-TPU
+pip install -r requirements.txt
+```
+
+### 步骤五：替换modeling_baichuan.py, 修改config.json, 生成onnx文件
+修改Baichuan2-7B-chat项目中config.json文件中max_position_embeddings与model_max_length，从4096变为512
+
+``` shell
+cd compile
+cp modeling_baichuan.py $BAICHUAN2_PATH
+python export_onnx_fast.py --model_path your_model_path
+```
+
+* PS1：your_model_path 指的是原模型下载后的地址, 如:"../../torch2onnx/Baichuan2-7B-Chat", 可以根据需要选择使用7b模型还是13b模型。
+* PS2：如果你想要debug，而不是一下子生成完成全部的onnx模型，可以将export_onnx_fast.py中的num_layers改成1, 并结合其中的test_net_with_mask函数对比单个block情况下是否可以和torch推理结果对齐。
+
+### 步骤六：生成bmodel文件
+
+生成模型
+
+``` shell
+./compile.sh --mode int8
+```
+
+* PS1：编译完成后最终会在Baichuan2-TPU/compile路径下生成名为baichuan2-{X}b_{Y}_{Z}dev.bmodel的文件,其中X为7或13，Y为执行`compile.sh`时选择的`mode`的数据类型,Z为推理的芯片数量(如果不指定num_device, Z默认为1)
+* PS2：生成bmodel耗时大概3小时以上，建议64G内存以及200G以上硬盘空间，不然很可能OOM或者no space left
+* PS3：目前给定的lib_pcie和lib_soc部分仅包含单芯的动态库，多芯部分会在后续更新
+
+----------------------------
+
+# 【阶段二】可执行文件生成（可以跳过）
+
+## 准备
+* bmodel模型准备：经过阶段一后将得到编译好的bmodel文件【也可以使用我们提供的现成编译好的bmodel文件】，下载方式为:
+```shell
+cd Baichuan2-TPU/model
+pip3 install dfss
+# baichuan2-7B
+python3 -m dfss --url=open@sophgo.com:sophon-demo/baichuan2/baichuan2-7b-test_int8.bmodel
+```
+将得到编译好的int8单芯bmodel模型文件。
+
+## 编译程序(C++版本)
+
+执行如下编译，默认是PCIE版本：
+
+```shell
+cd Baichuan2-TPU/demo
+mkdir build
+cd build
+cmake ..
+make
+```
+
+如果是SoC版本，有两种编译方法：
+
+方法1：直接将demo目录拷贝到SoC环境，按以上步骤编译(推荐)
+
+方法2：docker中交叉编译，如下操作
+
+```shell
+wget https://releases.linaro.org/components/toolchain/binaries/7.5-2019.12/aarch64-linux-gnu/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz
+tar -xvf gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz
+mv gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu /opt/aarch64-linux-gnu-7.5.0
+cd Baichuan2-TPU/demo
+mkdir build
+cd build
+cmake .. -DTARGET_ARCH=soc # soc 只有一颗芯片，因此不支持多芯编译
+make -j
+```
+
+编译生成baichuan2可执行程序。
+
+运行`baichuan2`:
+```shell
+./baichuan2 --model ../model/baichuan2-7b-test_int8.bmodel --dev dev_id
+```
+
+## 编译程序(Python Web版本)【单芯】
+
+```shell
+pip install gradio==3.39.0
+cd Baichuan2-TPU/web_demo
+mkdir build
+cd build
+cmake ..
+make -j
+```
+
+编译成功会在`build`文件夹下生成`libtpuchat.so*`, 此时可以在web_demo.py中指定bmodel\_path、token\_path、device\_id、lib\_path(编译生成的`libtpuchat.so*`文件, 默认路径是`./build`下), 以及dev\_id。
+```shell
+python web_demo.py
+```
+即可成功运行web的demo。
+* PS：在用户不修改上述token\_path和lib\_path的存放路径前提下只需指定bmodel\_path即可运行程序。
+
+如果是SoC环境，参考C++版本
+
+* PS：尽量下载gradio==3.39.0版本，不然会出现各种问题！！
+
+# 常见问题
+* 请根据实际block数目调整`demo/demo.cpp`中或者`web_demo/chat.cpp`中的NUM_LAYERS，默认是使用Baichuan2-7B(NUM_LAYERS=32)
\ No newline at end of file
diff --git a/models/Baichuan2/compile/compile.sh b/models/Baichuan2/compile/compile.sh
new file mode 100755
index 0000000..c71c28a
--- /dev/null
+++ b/models/Baichuan2/compile/compile.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+# Compile the exported Baichuan2 ONNX parts under ./tmp into one combined bmodel.
+# Usage: ./compile.sh [--mode f16|int8|int4] [--num_device N]
+#                     [--name baichuan2-7b|baichuan2-13b]
+set -ex
+models=
+mode="f16"
+folder="tmp"
+num_device=1
+mode_args=""
+device_args=""
+quantize_args="--quantize F16"
+name="baichuan2-7b" # default model, overridden by --name
+num_layers=
+out_model=$name.bmodel
+
+# Parse the options first so the name check below sees the user's choice.
+while [[ $# -gt 0 ]]; do
+    key="$1"
+
+    case $key in
+        --mode)
+            mode="$2"
+            shift 2
+            ;;
+        --num_device)
+            num_device="$2"
+            shift 2
+            ;;
+        --name)
+            name="$2"
+            shift 2
+            ;;
+        *)
+            echo "Invalid option: $key" >&2
+            exit 1
+            ;;
+    esac
+done
+
+# Reject unknown names here: otherwise num_layers stays empty and the script
+# only breaks at the block loop, after embedding and lm_head were compiled.
+case "$name" in
+    baichuan2-7b)  echo "Compile Baichuan2-7B" ;;
+    baichuan2-13b) echo "Compile Baichuan2-13B" ;;
+    *) echo "Invalid name: $name (expected baichuan2-7b or baichuan2-13b)" >&2; exit 1 ;;
+esac
+
+# Map --mode to the quantization flags handed to model_deploy.py for blocks.
+case "$mode" in
+    f16)  quantize_args="--quantize F16" ;;
+    int8) quantize_args="--quantize W8F16" ;;
+    int4) quantize_args="--quantize W4BF16 --q_group_size 64" ;;
+    *) echo "Invalid mode: $mode (expected f16, int8 or int4)" >&2; exit 1 ;;
+esac
+
+# Number of transformer blocks to compile; stays empty for an unknown name.
+if [ "$name" == "baichuan2-7b" ]; then
+    num_layers=32
+elif [ "$name" == "baichuan2-13b" ]; then
+    num_layers=40
+fi
+
+# The output file name always carries the mode and the device count.
+if [ "$num_device" != "1" ]; then
+    # only pass --num_device to the deploy tool for multi-chip builds
+    device_args="--num_device $num_device"
+    out_model=$name'_'$mode'_'$num_device'dev.bmodel'
+else
+    out_model=$name'_'$mode'_1dev.bmodel'
+fi
+
+# ---- Embedding: one ONNX, compiled with input length $seqlen and with input length 1 ----
+outdir=${folder}/embedding
+mkdir -p $outdir
+pushd $outdir
+
+seqlen=512
+# Quote the shape so the shell never treats [[...]] as a glob pattern.
+model_transform.py \
+    --model_name embedding \
+    --model_def ../embedding.onnx \
+    --input_shapes "[[$seqlen]]" \
+    --mlir embedding_${seqlen}.mlir
+
+model_deploy.py \
+    --mlir embedding_$seqlen.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    $device_args \
+    --model embedding_${seqlen}_f16.bmodel
+
+model_transform.py \
+    --model_name embedding_cache \
+    --model_def ../embedding.onnx \
+    --input_shapes "[[1]]" \
+    --mlir embedding_1.mlir
+
+model_deploy.py \
+    --mlir embedding_1.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    $device_args \
+    --model embedding_1_f16.bmodel
+
+# -f: with `set -e`, a plain `rm` would abort the script if no .npz was produced.
+rm -f *.npz
+
+models=$models' '$outdir'/embedding_1_f16.bmodel '$outdir'/embedding_'$seqlen'_f16.bmodel '
+
+popd
+echo $models
+
+# ---- lm_head: always F16; its output directory is keyed by mode and device count ----
+outdir=${folder}/$mode"_"$num_device"dev"/lm_head
+mkdir -p $outdir
+pushd $outdir
+
+model_transform.py \
+    --model_name lm_head \
+    --model_def ../../lm_head.onnx \
+    --mlir lm_head.mlir
+
+model_deploy.py \
+    --mlir lm_head.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    --model lm_head.bmodel
+
+# -f: with `set -e`, a plain `rm` would abort the script if no .npz was produced.
+rm -f *.npz
+
+models=${models}${outdir}'/lm_head.bmodel '
+popd
+echo $models
+
+# ---- Transformer blocks: each layer is compiled as block_N and block_cache_N ----
+outdir=${folder}/$mode"_"$num_device"dev"/block
+mkdir -p $outdir
+pushd $outdir
+# (no second `mkdir -p $outdir` here: after pushd it only created a stray nested tree)
+
+for ((i=0; i<$num_layers; i++))
+do
+model_transform.py \
+    --model_name block_$i \
+    --model_def ../../block_$i.onnx \
+    --mlir block_$i.mlir
+
+model_deploy.py \
+    --mlir block_$i.mlir \
+    $quantize_args \
+    --chip bm1684x \
+    --quant_output \
+    --quant_output_list 2,3 \
+    $device_args \
+    --model block_$i.bmodel
+
+model_transform.py \
+    --model_name block_cache_$i \
+    --model_def ../../block_cache_${i}.onnx \
+    --mlir block_cache_$i.mlir
+
+model_deploy.py \
+    --mlir block_cache_$i.mlir \
+    $quantize_args \
+    --chip bm1684x \
+    --quant_input \
+    --quant_output \
+    --quant_input_list 4,5 \
+    --quant_output_list 2,3 \
+    $device_args \
+    --model block_cache_$i.bmodel
+
+# -f: with `set -e`, a plain `rm` would abort the script if no .npz was produced.
+rm -f *.npz
+# rm ../../block_$i.onnx
+# rm ../../block_cache_$i.onnx
+
+models=${models}${outdir}'/block_'$i'.bmodel '$outdir'/block_cache_'$i'.bmodel '
+
+done
+popd
+echo $models
+# Merge every partial bmodel collected in $models into the final output file.
+model_tool --combine $models -o $out_model
diff --git a/models/Baichuan2/compile/export_onnx_fast.py b/models/Baichuan2/compile/export_onnx_fast.py
new file mode 100755
index 0000000..dbab131
--- /dev/null
+++ b/models/Baichuan2/compile/export_onnx_fast.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+# ==============================================================================
+#
+# Copyright (C) 2023 Sophgo Technologies Inc.  All rights reserved.
+#
+# TPU-MLIR is licensed under the 2-Clause BSD License except for the
+# third-party components.
+#
+# ==============================================================================
+
+import os
+import datetime
+import math
+import unittest
+import torch
+import random
+import sys
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.utils import GenerationConfig
+import numpy as np
+import argparse
+
+# Command-line options: where the original torch checkpoint lives and the
+# sequence length used for the dummy inputs of the exported ONNX graphs.
+parser = argparse.ArgumentParser(description='export Baichuan2 onnx.')
+parser.add_argument('--model_path', type=str, default="../baichuan2-7B/Baichuan2-7B-Chat/", help='path to the torch model.')
+parser.add_argument('--max_length', type=int, default=512, help="max sequence length")
+
+args = parser.parse_args()
+
+model_path = args.model_path
+MAX_LEN = args.max_length  # sequence dimension of the dummy tensors built by the convert_* functions
+folder = "./tmp"  # output directory, created by the script before the export loop
+
+origin_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)  # runs the modeling code shipped with the checkpoint
+origin_model.generation_config = GenerationConfig.from_pretrained(model_path)
+origin_model.eval()
+transformer = origin_model.model  # the wrappers below read embed_tokens, layers and norm from it
+config = origin_model.config
+
+
+for param in origin_model.parameters():
+    param.requires_grad = False  # export only, no gradients needed
+
+num_layers = config.num_hidden_layers
+hidden_size = config.hidden_size
+num_attention_heads = config.num_attention_heads
+head_dim = hidden_size // num_attention_heads
+layers = transformer.layers
+tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)  # only used by test_net_with_mask
+
+
+def set_seed(seed):
+    """Seed Python, NumPy and torch (CPU + all CUDA devices) and make cuDNN deterministic."""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False
+
+class Embedding(torch.nn.Module):  # export wrapper around the token-embedding lookup
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input_ids):
+        return transformer.embed_tokens(input_ids)  # `transformer` is the module-level origin_model.model
+
+
+class Block(torch.nn.Module):
+    # Export wrapper around one decoder layer, called without a KV cache.
+    def __init__(self, layer_id):
+        super().__init__()
+        # index into the module-level `layers` list and the layer it selects
+        self.layer_id = layer_id
+        self.layer = layers[layer_id]
+
+    def forward(self, hidden_states, position_ids, attention_mask):
+        hidden_states, past_kv = self.layer(hidden_states,
+                                            attention_mask,
+                                            position_ids,
+                                            use_cache=True)
+        past_k, past_v = past_kv  # key/value pair the layer returns when use_cache=True
+        return hidden_states, past_k, past_v
+
+
+class BlockCache(torch.nn.Module):
+    # Export wrapper around one decoder layer, called with an existing KV cache.
+    def __init__(self, layer_id):
+        super().__init__()
+        # index into the module-level `layers` list and the layer it selects
+        self.layer_id = layer_id
+        self.layer = layers[layer_id]
+
+    def forward(self, hidden_states, position_ids, attention_mask, past_k,
+                past_v):
+        hidden_states, past_kv = self.layer(hidden_states,
+                                            attention_mask,
+                                            position_ids=position_ids,
+                                            past_key_value=(past_k, past_v),
+                                            use_cache=True)
+        past_k, past_v = past_kv  # key/value pair the layer returns when use_cache=True
+        return hidden_states, past_k, past_v
+
+
+class LmHead(torch.nn.Module):
+    # Export wrapper: final norm, lm_head projection and top-1 token selection.
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, hidden_states):
+        hidden_states = transformer.norm(hidden_states)
+        m_logits = origin_model.lm_head(hidden_states)
+        _, token = torch.topk(m_logits, 1)  # keep only the index of the largest logit
+        return token
+
+
+def convert_block(layer_id):
+    """Export decoder layer `layer_id` (no KV cache) to `{folder}/block_{layer_id}.onnx`."""
+    # Dummy inputs that fix the exported shapes: MAX_LEN tokens and a causal
+    # mask holding -1000 above the diagonal.
+    hidden_states = torch.randn((1, MAX_LEN, hidden_size))
+    position_ids = torch.tensor([range(MAX_LEN)], dtype=torch.long)
+    attention_mask = -1000 * torch.ones((1, 1, MAX_LEN, MAX_LEN), dtype=torch.float32).triu(diagonal=1)
+    model = Block(layer_id)
+
+    torch.onnx.export(
+        model, (hidden_states, position_ids, attention_mask),
+        f'{folder}/block_{layer_id}.onnx',  # write into the module-level output directory
+        verbose=False,
+        input_names=['input_states', 'position_ids', 'attention_mask'],
+        output_names=['hidden_states', 'past_k', 'past_v'],
+        do_constant_folding=True,
+        opset_version=15)
+
+
+def convert_block_cache(layer_id):
+    """Export decoder layer `layer_id` with KV-cache inputs to `{folder}/block_cache_{layer_id}.onnx`."""
+    # Dummy inputs: one new token plus MAX_LEN cached positions, so the mask is MAX_LEN + 1 wide.
+    np.random.seed(42)
+    hidden_states = torch.randn((1, 1, hidden_size))
+    position_ids = torch.tensor([range(1)], dtype=torch.long)
+    attention_mask = -1000 * torch.ones((1, 1, 1, MAX_LEN + 1), dtype=torch.float32).triu(diagonal=0)
+    past_k = torch.randn((1, MAX_LEN, num_attention_heads, head_dim))
+    past_v = torch.randn((1, MAX_LEN, num_attention_heads, head_dim))
+    model = BlockCache(layer_id)
+
+    torch.onnx.export(
+        model, (hidden_states, position_ids, attention_mask, past_k, past_v),
+        f'{folder}/block_cache_{layer_id}.onnx',  # write into the module-level output directory
+        verbose=False,
+        input_names=[
+            'input_states', 'position_ids', 'attention_mask', 'history_k',
+            'history_v'
+        ],
+        output_names=['hidden_states', 'past_k', 'past_v'],
+        do_constant_folding=True,
+        opset_version=15)
+
+
+def convert_embedding():
+    """Export the embedding lookup to `{folder}/embedding.onnx` with a dynamic input length."""
+    torch.onnx.export(Embedding(), (torch.tensor([0, 1, 2, 3])),
+                      f'{folder}/embedding.onnx',
+                      verbose=False,
+                      input_names=['input_ids'],
+                      output_names=['input_embed'],
+                      dynamic_axes={"input_ids": {
+                          0: "length"
+                      }},
+                      do_constant_folding=True,
+                      opset_version=15)
+
+
+def convert_lm_head():
+    """Export final norm + lm_head + top-1 selection to `{folder}/lm_head.onnx`."""
+    hidden = torch.randn(1, hidden_size)  # dummy input; named so it no longer shadows builtin `input`
+    torch.onnx.export(LmHead(), (hidden),
+                      f'{folder}/lm_head.onnx',
+                      verbose=False,
+                      input_names=['hidden_states'],
+                      output_names=['token'],
+                      do_constant_folding=True,
+                      opset_version=15)
+
+
+def test_net_with_mask():  # debug helper: greedy-decodes a fixed prompt through the export wrappers
+    embed = Embedding()
+    blocks = [Block(i) for i in range(num_layers)]
+    block_kvs = [BlockCache(i) for i in range(num_layers)]
+    ids = tokenizer.encode('解释一下“温故而知新”这句话的意思。')
+    print("input ids:{}".format(ids))
+    token_len = len(ids)
+    ids = ids + (MAX_LEN - token_len) * [0]  # right-pad with token id 0 up to MAX_LEN
+    input_ids = torch.tensor(ids).view(MAX_LEN)
+    out = embed(input_ids).view(1, MAX_LEN, hidden_size)
+    position_ids = list(range(token_len)) + (MAX_LEN - token_len) * [0]
+    position_ids = torch.tensor([position_ids])
+    attention_mask = -1000 * torch.ones((MAX_LEN, MAX_LEN))  # start fully masked
+    for i in range(token_len):
+        for j in range(token_len):
+            if j <= i:
+                attention_mask[i][j] = 0  # causal window over the real tokens only
+    attention_mask = attention_mask.view(1, 1, MAX_LEN, MAX_LEN)
+    k_cache = []
+    v_cache = []
+    for i in range(num_layers):
+        out, k, v = blocks[i](out, position_ids, attention_mask)
+        k[:,MAX_LEN - token_len:] = k[:,:token_len]  # move the real entries to the end of dim 1
+        v[:,MAX_LEN - token_len:] = v[:,:token_len]
+        k[:,:MAX_LEN - token_len] = 0  # zero everything in front of them
+        v[:,:MAX_LEN - token_len] = 0
+        k_cache.append(k)
+        v_cache.append(v)
+    out = out[:,token_len - 1:token_len].view(1, hidden_size)  # hidden state at the last real token
+    lm = LmHead()
+    token = lm(out).view(1)
+    out_ids = [int(token)]
+    word = tokenizer._convert_id_to_token(int(token[0]))
+    print(word, end="")
+    while token > 2 and token_len < 64:  # NOTE(review): presumably ids <= 2 are special/stop tokens - confirm
+        token_len += 1
+        input_ids = torch.tensor([token])
+        out = embed(input_ids).view(1, 1, hidden_size)
+        position_ids = torch.tensor([[token_len - 1]])
+        attention_mask = -1000 * torch.ones((1, 1, 1, MAX_LEN + 1))
+        attention_mask[:, :, :, MAX_LEN + 1 - token_len:] = 0  # unmask the last token_len of the MAX_LEN + 1 slots
+        for i in range(num_layers):
+            out, present_k_cache, present_v_cache = block_kvs[i](out, position_ids,
+                                                    attention_mask,
+                                                    k_cache[i], v_cache[i])
+            new_k = torch.zeros(k_cache[i].shape)
+            new_v = torch.zeros(v_cache[i].shape)
+            new_k[:,MAX_LEN - token_len:MAX_LEN - 1] = k_cache[i][:,MAX_LEN - token_len + 1:]  # shift old entries one slot towards the front
+            new_v[:,MAX_LEN - token_len:MAX_LEN - 1] = v_cache[i][:,MAX_LEN - token_len + 1:]
+            new_k[:,MAX_LEN - 1:] = present_k_cache  # newest key goes into the last slot
+            new_v[:,MAX_LEN - 1:] = present_v_cache
+            k_cache[i] = new_k
+            v_cache[i] = new_v
+        token = lm(out).view(1)
+        out_ids.append(int(token))
+        word = tokenizer._convert_id_to_token(int(token[0]))
+        print(word, end="")
+    print("\noutput_ids:{}".format(out_ids))
+
+set_seed(42)
+# test_net_with_mask()
+
+# create folder to store onnx; exist_ok replaces the separate exists() check
+os.makedirs(folder, exist_ok=True)
+
+
+# export models: for every layer the cached and the uncached variant, then
+# the embedding and the lm_head
+for i in range(num_layers):
+    print("convert_block_{}".format(i))
+    convert_block_cache(i)
+    convert_block(i)
+print("convert_embedding")
+convert_embedding()
+print("convert_lmhead")
+convert_lm_head()
diff --git a/models/Baichuan2/compile/modeling_baichuan.py b/models/Baichuan2/compile/modeling_baichuan.py
new file mode 100644
index 0000000..5046dfc
--- /dev/null
+++ b/models/Baichuan2/compile/modeling_baichuan.py
@@ -0,0 +1,792 @@
+# Copyright 2023 Baichuan Inc. All Rights Reserved.
+
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .configuration_baichuan import BaichuanConfig
+from .generation_utils import build_chat_input, TextIterStreamer
+
+import math
+from typing import List, Optional, Tuple, Union
+from threading import Thread
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.generation.utils import GenerationConfig
+from transformers.utils import logging, ContextManagers
+
+import os
+from contextlib import contextmanager
+logger = logging.get_logger(__name__)
+
+try:
+    from xformers import ops as xops
+except ImportError:
+    xops = None
+    logger.warning(
+        "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers."
+    )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+        input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Build an additive causal mask of shape [bsz, 1, tgt_len, tgt_len + past_key_values_length].
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)  # 0 where column <= row, dtype min elsewhere
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)  # past columns are all 0
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands a `[bsz, seq_len]` or `[bsz, tgt_len, src_len]` mask to `[bsz, 1, tgt_seq_len, src_seq_len]` and inverts it.
+    """
+    if len(mask.size()) == 3:
+        bsz, src_len, _ = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+        expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    else:
+        bsz, src_len = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask  # non-zero wherever the input mask was not 1
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)  # those positions become dtype min
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square norm over the last dimension with a learned per-channel scale (no bias, no mean subtraction).
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)  # mean of squares, computed in float32
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # cast back to the weight dtype when the weight is stored in half precision
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+class RotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.max_seq_len_cached = max_position_embeddings
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
+        freqs = torch.outer(t, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32)  # plain attributes, not registered buffers
+        self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32)
+    def forward(self, x, seq_len=None):
+        # x is only read for its device; NOTE(review): seq_len defaults to None but is compared with an int below, so callers must pass it
+        # The tables are rebuilt only when seq_len exceeds what __init__ precomputed.
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
+            freqs = torch.outer(t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device)
+            self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device)
+        elif self.cos_cached.device != x.device:
+            self.cos_cached = self.cos_cached.to(x.device)
+            self.sin_cached = self.sin_cached.to(x.device) 
+        return (
+            self.cos_cached[:, :, :seq_len, ...],
+            self.sin_cached[:, :, :seq_len, ...],
+        )
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dimension and negate the half that moves to the front."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids):
+    cos = cos_.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin_.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    cos = cos.transpose(1, 2)  # [bs, seq_len, 1, dim]; assumes q/k are laid out [bs, seq_len, heads, dim] - TODO confirm at call site
+    sin = sin.transpose(1, 2)  # [bs, seq_len, 1, dim]
+    q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin)  # rotation is computed in float32
+    k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin)
+    return q_embed.to(q.dtype), k_embed.to(k.dtype)  # cast back to the input dtypes
+
+
+class MLP(nn.Module):  # gated feed-forward: down_proj(act(gate_proj(x)) * up_proj(x))
+    def __init__(
+            self,
+            hidden_size: int,
+            intermediate_size: int,
+            hidden_act: str,  # key into transformers' ACT2FN table
+    ):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class Attention(nn.Module):
+    """Multi-headed self-attention with rotary embeddings; q/k/v are kept as [bsz, seq, heads, head_dim]."""
+    def __init__(self, config: BaichuanConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)  # fused q/k/v projection
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        proj = self.W_pack(hidden_states)
+        proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2)
+        query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim)
+        value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim)
+
+        kv_seq_len = key_states.shape[-3]
+        if past_key_value is not None:
+            kv_seq_len = kv_seq_len + past_key_value[0].shape[-3]
+        if past_key_value is not None:
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len-1)
+        else:
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # layout is [bsz, t, nh, hd]; only the current step's k/v are handed back as cache
+        past_kv = (key_states, value_states) if use_cache else None
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=1)
+            value_states = torch.cat([past_key_value[1], value_states], dim=1)
+
+        attn_weights = None  # bound up front: neither backend below materialises attention weights
+        if xops is not None and self.training:
+            # q/k/v are already [bsz, seq, heads, head_dim], the layout
+            # xformers' memory_efficient_attention takes, so they are passed
+            # through untransposed and the output needs no transpose back
+            # before the reshape below.
+            attn_output = xops.memory_efficient_attention(
+                query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask()
+            )
+        else:
+            with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
+                query_states = query_states.transpose(1, 2)
+                key_states = key_states.transpose(1, 2)
+                value_states = value_states.transpose(1, 2)
+                attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
+            attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        # attn_weights is always None: output_attentions cannot be honoured
+        # because neither attention backend returns the weights.
+        return attn_output, attn_weights, past_kv
+    
+
+class DecoderLayer(nn.Module):  # pre-norm block: x + attn(norm(x)), then x + mlp(norm(x))
+    def __init__(self, config: BaichuanConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Attention(config=config)
+        self.mlp = MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention (normalised input, residual added afterwards)
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected (same pre-norm + residual pattern)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)  # tuple grows with the optional extras below, in this order
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+class BaichuanPreTrainedModel(PreTrainedModel):
+    config_class = BaichuanConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DecoderLayer"]
+    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, BaichuanModel):
+            module.gradient_checkpointing = value
+
+
+class BaichuanModel(BaichuanPreTrainedModel):
+    """Decoder stack: token embedding, ``config.num_hidden_layers`` DecoderLayer blocks, final RMSNorm."""
+
+    def __init__(self, config: BaichuanConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Toggled through BaichuanPreTrainedModel._set_gradient_checkpointing.
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with ``value``."""
+        self.embed_tokens = value
+
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        """Combine the causal mask with the caller-supplied padding mask.
+
+        A causal mask is only built when more than one position is processed
+        (``input_shape[-1] > 1``). If ``attention_mask`` is given it is expanded
+        with ``_expand_mask`` and added to the causal mask (or used alone).
+        Returns ``None`` when neither mask applies.
+        """
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        """Run the decoder stack.
+
+        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given, otherwise
+        ``ValueError`` is raised. Flags left as ``None`` fall back to the values
+        stored on ``self.config``.
+
+        Returns a ``BaseModelOutputWithPast`` when ``return_dict`` is truthy,
+        otherwise a tuple of the non-``None`` values among (last hidden state,
+        cache, all hidden states, all attentions).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            # Cached length is read from dim 2 of the first layer's first cached tensor
+            # (assumes the cache keeps the sequence axis there - TODO confirm against Attention).
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            # Default positions continue right after the cached prefix.
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Build the attention mask: default to "attend everywhere" over past + current
+        # positions, then combine it with the causal mask.
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                # Records the input of each layer; the final normed output is appended after the loop.
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        # inputs already ends with None (past_key_value); the two extra
+                        # positionals land on output_attentions and use_cache.
+                        return module(*inputs, output_attentions, None)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache entry follows the attention weights when those are returned.
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class NormHead(nn.Module):
+    def __init__(self, hidden_size, vocab_size, bias=False):
+        super().__init__()
+        self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size)))
+        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        self.first_flag = True
+
+    def forward(self, hidden_states):
+        if self.training:
+            norm_weight = nn.functional.normalize(self.weight)
+            self.first_flag = True
+        elif self.first_flag:
+            self.first_flag = False
+            self.weight.data = nn.functional.normalize(self.weight)
+            norm_weight = self.weight
+        else:
+            norm_weight = self.weight
+        return nn.functional.linear(hidden_states, norm_weight)
+
+_init_weights = True
+@contextmanager
+def no_init_weights(_enable=True):
+    global _init_weights
+    old_init_weights = _init_weights
+    if _enable:
+        _init_weights = False
+    try:
+        yield
+    finally:
+        _init_weights = old_init_weights
+
+class BaichuanForCausalLM(BaichuanPreTrainedModel):
+    def __init__(self, config, *model_args, **model_kwargs):
+        super().__init__(config, *model_args, **model_kwargs)
+        self.model = BaichuanModel(config)
+
+        self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
+        if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False):
+            try:
+                from .quantizer import quantize_offline, init_model_weight_int4
+            except ImportError:
+                raise ImportError(f"Needs QLinear to run quantize.")
+            quantize_offline(self, 4)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+    
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: bool = None,
+        **kwargs,
+    ):
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
+            config, model_kwargs = cls.config_class.from_pretrained(
+                config_path,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                resume_download=False,
+                proxies=None,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                subfolder="",
+                _from_auto=False,
+                _from_pipeline=None,
+                **kwargs,
+            )
+        else:
+            model_kwargs = kwargs
+        
+        if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
+            try:
+                from .quantizer import init_model_weight_int4
+                from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map
+                from accelerate.utils import CustomDtype
+                from accelerate.utils import get_balanced_memory
+            except ImportError:
+                raise ImportError(f"Needs import model weight init func to run quantize.") 
+            # Instantiate model.
+            init_contexts = [no_init_weights(_enable=True)]
+            init_contexts.append(init_empty_weights())
+            with ContextManagers(init_contexts):
+                model = cls(config)
+            
+            model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
+            state_dict = torch.load(model_file, map_location="cpu") 
+            model.is_quantized = True
+            
+            device_map = kwargs.pop("device_map", None)
+            torch_dtype = kwargs.pop("torch_dtype", None)
+            
+            if device_map is not None:
+                kwargs = {"no_split_module_classes": model._no_split_modules}
+                target_dtype = CustomDtype.INT4
+                max_memory = get_balanced_memory(
+                    model,
+                    dtype=target_dtype,
+                    low_zero=(device_map == "balanced_low_0"),
+                    max_memory=None,
+                    **kwargs,
+                )
+                kwargs["max_memory"] = max_memory
+                device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
+                
+            model = init_model_weight_int4(config, model, state_dict)
+            
+            # Set model in evaluation mode to deactivate DropOut modules by default
+            model.eval()
+            # If it is a model with generation capabilities, attempt to load the generation config
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        resume_download=False,
+                        proxies=None,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder="",
+                        _from_auto=False,
+                        _from_pipeline=None,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    logger.info(
+                        "Generation config file not found, using a generation config created from the model config."
+                    )
+                    pass
+            
+            if device_map is not None:
+                dispatch_model(model, device_map=device_map)
+            
+            return model
+        return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, 
+                config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, 
+                force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, 
+                use_safetensors=use_safetensors, **kwargs)   
+
+    def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            softmax_normalizer = shift_logits.max(-1).values ** 2
+            z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels) + z_loss
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+        return reordered_past
+
+    def quantize(self, bits: int):
+        try:
+            from .quantizer import quantize_online
+        except ImportError:
+            raise ImportError(f"Needs QLinear to run quantize.")
+        return quantize_online(self, bits)
+
+    def chat(self, tokenizer, messages: List[dict], stream=False,
+             generation_config: Optional[GenerationConfig]=None):
+        generation_config = generation_config or self.generation_config
+        input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
+        if stream:
+            streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            Thread(target=self.generate, kwargs=dict(
+                inputs=input_ids, streamer=streamer,
+                generation_config=generation_config,
+            )).start()
+            return streamer
+        else:
+            outputs = self.generate(input_ids, generation_config=generation_config)
+            response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
+            return response
diff --git a/models/Baichuan2/compile/torch_inference.py b/models/Baichuan2/compile/torch_inference.py
new file mode 100644
index 0000000..77c5319
--- /dev/null
+++ b/models/Baichuan2/compile/torch_inference.py
@@ -0,0 +1,16 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.utils import GenerationConfig
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('model_path', help='下载模型的绝对路径')
+args = parser.parse_args()
+model_path = args.model_path
+tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float32, trust_remote_code=True)
+model.generation_config = GenerationConfig.from_pretrained(model_path)
+messages = []
+messages.append({"role": "user", "content": "解释一下“温故而知新”"})
+response = model.chat(tokenizer, messages)
+print(response)
\ No newline at end of file
diff --git a/models/Baichuan2/demo/CMakeLists.txt b/models/Baichuan2/demo/CMakeLists.txt
new file mode 100755
index 0000000..5acf3bf
--- /dev/null
+++ b/models/Baichuan2/demo/CMakeLists.txt
@@ -0,0 +1,38 @@
+cmake_minimum_required(VERSION 2.8)
+project(baichuan2)
+
+# Build flavour: "pcie" (default) or "soc" (cross-compile for aarch64).
+if (NOT DEFINED TARGET_ARCH)
+    set(TARGET_ARCH pcie)
+endif()
+
+set(CMAKE_INSTALL_PREFIX install)
+
+# Pick the prebuilt runtime libraries. A native aarch64 host is always built as
+# SoC, whatever TARGET_ARCH says.
+if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+    add_definitions(-DSOC_TARGET)
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
+    message("SoC mode, starting......")
+elseif (${TARGET_ARCH} STREQUAL "pcie")
+    add_definitions(-DPCIE_TARGET)
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie)
+    message("Pcie mode, starting......")
+elseif (${TARGET_ARCH} STREQUAL "soc")
+    add_definitions(-DSOC_TARGET)
+    # NOTE(review): the cross toolchain is selected after project(); confirm it
+    # takes effect, or move it to a toolchain file.
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+    set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc)
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+    # Same library directory as the native SoC branch above. The previous
+    # ${PROJECT_SOURCE_DIR}/lib_soc does not match the ../src/lib_soc layout.
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
+    message("SoC mode, starting......")
+endif()
+
+include_directories(${PROJECT_SOURCE_DIR}/../src/include)
+
+add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
+set(CMAKE_BUILD_TYPE "Debug")
+
+# The executable is written next to the sources.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(baichuan2 demo.cpp)
+target_link_libraries(baichuan2 bmrt bmlib sentencepiece)
+
diff --git a/models/Baichuan2/demo/demo.cpp b/models/Baichuan2/demo/demo.cpp
new file mode 100755
index 0000000..6956b6a
--- /dev/null
+++ b/models/Baichuan2/demo/demo.cpp
@@ -0,0 +1,472 @@
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Sophgo Technologies Inc.  All rights reserved.
+//
+// TPU-MLIR is licensed under the 2-Clause BSD License except for the
+// third-party components.
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <assert.h>
+#include <chrono>
+#include <algorithm>
+#include "memory.h"
+#include "sentencepiece/sentencepiece_processor.h"
+#include "bmruntime_interface.h"
+#include <getopt.h>
+#include <numeric>
+
+// Number of "block_<i>" / "block_cache_<i>" networks looked up in the bmodel;
+// also sizes the per-layer arrays in Baichuan2.
+static const int NUM_LAYERS = 32;
+// Length of the host-side token, position and mask buffers used for the first pass.
+static const int MAX_LEN = 512;
+// Value written into attention-mask entries for positions that are masked out.
+static const float ATTENTION_MASK = -1000.;
+
+// Relative path of the SentencePiece tokenizer model.
+static const std::string TOKENIZER_MODEL = "../model/tokenizer.model";
+
+// #define EXPORT_RESULTS
+#ifdef EXPORT_RESULTS
+#include "cnpy.h"
+static cnpy::npz_t map;
+
+/// Copies the device buffer `dst` to the host as elements of type T and
+/// stores it in the global npz map under `name`.
+template <typename T>
+static void add_array(std::string name, bm_handle_t bm_handle,
+                      const bm_device_mem_t &dst) {
+  const auto element_count = dst.size / sizeof(T);
+  std::vector<T> host_copy(element_count);
+  bm_memcpy_d2s(bm_handle, host_copy.data(), dst);
+  cnpy::npz_add_array(map, name, host_copy);
+}
+
+/// Writes every array collected in the global npz map to `filename`.
+static void save_array(std::string filename) {
+  cnpy::npz_save_all(filename, map);
+}
+#endif
+
+/// Drives the Baichuan2 bmodel: owns the device handles, the bmruntime
+/// instance, the tokenizer and every device tensor used by the forward passes.
+class Baichuan2 {
+public:
+  /// Loads tokenizer and bmodel and allocates all device tensors.
+  void init(const std::vector<int> &devid, std::string model);
+  void chat();
+  /// Frees the device tensors, the runtime and the device handles.
+  void deinit();
+
+private:
+  void answer(const std::string &input_str);
+  /// Runs the prompt through embedding, all blocks and lm_head; returns a token id.
+  int forward_first(std::vector<int> &tokens);
+  int forward_next();
+  /// Loads TOKENIZER_MODEL and caches the EOS id; exits the process on failure.
+  void load_sentencepiece();
+
+private:
+  // Pointer and scalar members get defined defaults so that an object which
+  // was never init()-ed does not hold indeterminate values.
+  std::vector<bm_handle_t> handles;
+  bm_handle_t bm_handle{}; // first entry of `handles`; used for all transfers
+  void *p_bmrt = nullptr;
+  sentencepiece::SentencePieceProcessor sentencepiece;
+  const bm_net_info_t *net_blocks[NUM_LAYERS] = {};
+  const bm_net_info_t *net_blocks_cache[NUM_LAYERS] = {};
+  const bm_net_info_t *net_embed = nullptr;
+  const bm_net_info_t *net_embed_cache = nullptr;
+  const bm_net_info_t *net_lm = nullptr;
+  bm_tensor_t inputs_embed_512, outputs_embed_512;
+  bm_tensor_t inputs_lm, outputs_lm;
+  bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention;
+  bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS];
+  bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS];
+  bm_tensor_t present_key_cache, present_value_cache;
+  std::string name_embed;
+  std::string name_embed_cache;
+  std::string name_lm;
+  std::string name_blocks[NUM_LAYERS];
+  std::string name_blocks_cache[NUM_LAYERS];
+  int round = 0;
+  int token_length = 0; // set by forward_first to the prompt length
+  int EOS = 0;          // set by load_sentencepiece from the tokenizer
+  std::vector<std::string> history;
+};
+
+/// Loads the SentencePiece model at TOKENIZER_MODEL and caches its EOS id.
+/// On failure the tokenizer status is printed and the process exits with -1.
+void Baichuan2::load_sentencepiece() {
+  printf("Load %s ... ", TOKENIZER_MODEL.c_str());
+  const auto load_status = sentencepiece.Load(TOKENIZER_MODEL);
+  if (load_status.ok()) {
+    EOS = sentencepiece.eos_id();
+    printf("Done!\n");
+    return;
+  }
+  std::cout << load_status.ToString() << std::endl;
+  exit(-1);
+}
+
+/// Loads the tokenizer, requests the given devices, loads the bmodel and
+/// allocates every device tensor used by the forward passes.
+///
+/// @param devices device ids to request; the first one hosts the runtime.
+/// @param model   path of the bmodel file.
+///
+/// Any failure is reported on stderr and ends the process with exit(-1).
+void Baichuan2::init(const std::vector<int> &devices, std::string model) {
+  // Explicit checks instead of assert(): they remain active under NDEBUG and
+  // say what failed.
+  auto require = [](bool ok, const std::string &what) {
+    if (!ok) {
+      std::cerr << "Baichuan2::init failed: " << what << std::endl;
+      exit(-1);
+    }
+  };
+  // Looks up a network by name; a missing network would otherwise surface as
+  // a null dereference when its shapes are read below.
+  auto get_net = [&](const std::string &name) {
+    const bm_net_info_t *info = bmrt_get_network_info(p_bmrt, name.c_str());
+    require(info != nullptr, "network '" + name + "' not found in bmodel");
+    return info;
+  };
+  // Allocates one device tensor with the given dtype and shape.
+  auto alloc = [&](bm_tensor_t &tensor, auto dtype, auto shape) {
+    require(bmrt_tensor(&tensor, p_bmrt, dtype, shape),
+            "device tensor allocation");
+  };
+
+  load_sentencepiece();
+  require(!devices.empty(), "no device id given");
+
+  // request bm_handle
+  std::cout << "Device [ ";
+  for (auto d : devices) {
+    std::cout << d << " ";
+  }
+  std::cout << "] loading ....\n";
+  for (auto d : devices) {
+    bm_handle_t h;
+    require(BM_SUCCESS == bm_dev_request(&h, d),
+            "bm_dev_request for device " + std::to_string(d));
+    handles.push_back(h);
+  }
+  bm_handle = handles[0];
+
+  // create bmruntime
+  p_bmrt = bmrt_create(bm_handle);
+  require(NULL != p_bmrt, "bmrt_create");
+
+  // load bmodel by file
+  printf("Model[%s] loading ....\n", model.c_str());
+  require(bmrt_load_bmodel(p_bmrt, model.c_str()), "loading bmodel " + model);
+  printf("Done!\n");
+
+  // net names
+  name_embed = "embedding";
+  name_embed_cache = "embedding_cache";
+  name_lm = "lm_head";
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    name_blocks[i] = "block_" + std::to_string(i);
+    name_blocks_cache[i] = "block_cache_" + std::to_string(i);
+  }
+
+  // net infos
+  net_embed = get_net(name_embed);
+  net_embed_cache = get_net(name_embed_cache);
+  net_lm = get_net(name_lm);
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    net_blocks[i] = get_net(name_blocks[i]);
+    net_blocks_cache[i] = get_net(name_blocks_cache[i]);
+  }
+
+  // net device mem
+  alloc(inputs_embed_512, net_embed->input_dtypes[0],
+        net_embed->stages[0].input_shapes[0]);
+  alloc(outputs_embed_512, net_embed->output_dtypes[0],
+        net_embed->stages[0].output_shapes[0]);
+
+  alloc(inputs_pid, net_blocks[0]->input_dtypes[1],
+        net_blocks[0]->stages[0].input_shapes[1]);
+  alloc(inputs_attention, net_blocks[0]->input_dtypes[2],
+        net_blocks[0]->stages[0].input_shapes[2]);
+
+  alloc(next_pid, net_blocks_cache[0]->input_dtypes[1],
+        net_blocks_cache[0]->stages[0].input_shapes[1]);
+  alloc(next_attention, net_blocks_cache[0]->input_dtypes[2],
+        net_blocks_cache[0]->stages[0].input_shapes[2]);
+
+  // Per-layer key/value tensors all use the shapes of block 0.
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    alloc(past_key[i], net_blocks[0]->output_dtypes[1],
+          net_blocks[0]->stages[0].output_shapes[1]);
+    alloc(past_value[i], net_blocks[0]->output_dtypes[2],
+          net_blocks[0]->stages[0].output_shapes[2]);
+    alloc(present_key[i], net_blocks[0]->output_dtypes[1],
+          net_blocks[0]->stages[0].output_shapes[1]);
+    alloc(present_value[i], net_blocks[0]->output_dtypes[2],
+          net_blocks[0]->stages[0].output_shapes[2]);
+  }
+  alloc(present_key_cache, net_blocks_cache[0]->output_dtypes[1],
+        net_blocks_cache[0]->stages[0].output_shapes[1]);
+  alloc(present_value_cache, net_blocks_cache[0]->output_dtypes[2],
+        net_blocks_cache[0]->stages[0].output_shapes[2]);
+
+  alloc(inputs_lm, net_lm->input_dtypes[0], net_lm->stages[0].input_shapes[0]);
+  alloc(outputs_lm, net_lm->output_dtypes[0],
+        net_lm->stages[0].output_shapes[0]);
+}
+
+/// Frees every device tensor allocated by init(), then destroys the runtime
+/// and releases all requested device handles.
+void Baichuan2::deinit() {
+  // Tensors that exist once, released in the same order as before.
+  bm_tensor_t *const single_tensors[] = {
+      &inputs_embed_512, &outputs_embed_512, &inputs_lm,
+      &outputs_lm,       &inputs_pid,        &next_pid,
+      &inputs_attention, &next_attention,    &present_key_cache,
+      &present_value_cache};
+  for (bm_tensor_t *tensor : single_tensors) {
+    bm_free_device(bm_handle, tensor->device_mem);
+  }
+
+  // Per-layer key/value tensors.
+  for (int layer = 0; layer < NUM_LAYERS; ++layer) {
+    bm_free_device(bm_handle, past_key[layer].device_mem);
+    bm_free_device(bm_handle, past_value[layer].device_mem);
+    bm_free_device(bm_handle, present_key[layer].device_mem);
+    bm_free_device(bm_handle, present_value[layer].device_mem);
+  }
+
+  bmrt_destroy(p_bmrt);
+  for (bm_handle_t handle : handles) {
+    bm_dev_free(handle);
+  }
+}
+
+/// Runs the whole prompt through the embedding, every block and lm_head.
+///
+/// @param tokens prompt token ids; must hold between 1 and MAX_LEN entries.
+/// @return the value read back from the lm_head output tensor.
+///
+/// Side effects: sets token_length and fills past_key / past_value for each
+/// layer. A prompt outside [1, MAX_LEN] is reported on stderr and ends the
+/// process, since it would otherwise overrun the fixed-size input buffers.
+int Baichuan2::forward_first(std::vector<int> &tokens) {
+  if (tokens.empty() || tokens.size() > static_cast<std::size_t>(MAX_LEN)) {
+    std::cerr << "forward_first: prompt length " << tokens.size()
+              << " is outside [1, " << MAX_LEN << "]" << std::endl;
+    exit(-1);
+  }
+  token_length = static_cast<int>(tokens.size());
+
+  // Host staging buffers live on the heap: the mask alone is
+  // MAX_LEN * MAX_LEN floats, too large to place on the stack safely.
+  std::vector<int> input_ids(MAX_LEN, 0); // start token
+  std::vector<int> position_id(MAX_LEN, 0);
+  std::vector<float> attention_mask(MAX_LEN * MAX_LEN, ATTENTION_MASK);
+
+  std::copy(tokens.begin(), tokens.end(), input_ids.begin());
+  std::iota(position_id.begin(), position_id.begin() + token_length, 0);
+
+  // Row i may attend to columns 0..i, but only for rows inside the prompt;
+  // everything else keeps the ATTENTION_MASK fill value.
+  for (int i = 0; i < token_length; i++) {
+    std::fill_n(attention_mask.begin() + i * MAX_LEN, i + 1, 0.0f);
+  }
+
+  // forward embeding
+  bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem,
+                static_cast<void *>(input_ids.data()));
+  auto ret =
+      bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1,
+                            &outputs_embed_512, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  // forward blocks
+  bm_memcpy_s2d(bm_handle, inputs_pid.device_mem,
+                static_cast<void *>(position_id.data()));
+  bm_memcpy_s2d(bm_handle, inputs_attention.device_mem,
+                static_cast<void *>(attention_mask.data()));
+  // Each block writes its hidden states back into the embedding output
+  // buffer, so the same tensor serves as input and output of every layer.
+  auto inputs_embed = outputs_embed_512;
+  inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0];
+  bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention};
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]};
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3,
+                                outputs_block, 3, true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+  }
+
+  // Hand only the last prompt position to lm_head. `bytes` is the buffer size
+  // divided by MAX_LEN (assumes one equally sized row per position).
+  int bytes = inputs_embed.device_mem.size / MAX_LEN;
+  bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0,
+                     inputs_embed.device_mem, (token_length - 1) * bytes,
+                     bytes);
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
+                              &outputs_lm, 1, true, false);
+  assert(ret); // checked like the other launches above
+  bm_thread_sync(bm_handle);
+
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
+  return token;
+}
+
+// Decode step: embeds the token left in outputs_lm by the previous call, runs
+// the cached block nets and the LM head, and returns the next token id.
+int Baichuan2::forward_next() {
+  float attention_mask[MAX_LEN + 1] = {0};
+  for (int i = token_length - 1; i < MAX_LEN; i++) { // token_length-1..MAX_LEN-1
+    attention_mask[i] = ATTENTION_MASK;
+  }
+  int32_t position_id = token_length - 1;
+  // embedding
+  outputs_lm.shape = net_embed_cache->stages[0].input_shapes[0];
+  auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(), &outputs_lm, 1,
+                                   &inputs_lm, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+  // blocks
+  bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask);
+  bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id);
+  auto inputs_embed = inputs_lm;
+  inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0];
+  int bytes = bm_mem_get_device_size(present_key_cache.device_mem);
+  int token_offset = (token_length - 1) * bytes;
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention,
+                                   past_key[i], past_value[i]};
+    bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache};
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
+                                inputs_block, 5, outputs_block, 3, true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+    // Copy this step's key/value into slot token_length-1 of the layer cache.
+    bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset,
+                       present_key_cache.device_mem, 0, bytes);
+    bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset,
+                       present_value_cache.device_mem, 0, bytes);
+  }
+  outputs_lm.shape = net_lm->stages[0].output_shapes[0];
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
+                              &outputs_lm, 1, true, false);
+  assert(ret); // this launch used to be the only unchecked one
+  bm_thread_sync(bm_handle);
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
+  return token;
+}
+
+// Interactive loop: reads one question per line from stdin and prints the
+// model's answer. "exit" or end of input quits; "clear" empties the history.
+void Baichuan2::chat() {
+  const std::string user_token = "<reserved_106>";      // user token id 195
+  const std::string assistant_token = "<reserved_107>"; // assistant token id 196
+  while (true) {
+    std::cout << "\nQuestion: ";
+    std::string input_str;
+    // A failed read (EOF, closed stdin) used to make this loop spin forever.
+    if (!std::getline(std::cin, input_str) || input_str == "exit") {
+      break;
+    }
+    if (input_str == "clear") {
+      history.clear();
+      continue;
+    }
+    input_str = user_token + input_str + assistant_token;
+    std::cout << "\nAnswer: " << std::flush;
+    answer(input_str);
+    std::cout << std::endl;
+  }
+}
+
+// Answers one chat turn: appends `input_str` to the history, encodes the whole
+// history, streams the generated text to stdout and stores it in the history.
+void Baichuan2::answer(const std::string &input_str) {
+  int tok_num = 0;
+  history.emplace_back(input_str);
+  std::vector<int> tokens;
+  std::string history_input = std::accumulate(history.begin(), history.end(), std::string());
+  sentencepiece.Encode(history_input, &tokens);
+  if (tokens.empty()) {
+    printf("Sorry: your question is too wierd!!\n");
+    history.clear();
+    round = 0;
+    return;
+  }
+  // make sure token not too large
+  if (tokens.size() > MAX_LEN - 10) {
+    if (round == 0) {
+      printf("Error: your question is too large!\n");
+      // Drop the rejected question, or every later turn would overflow too.
+      history.pop_back();
+      return;
+    }
+    // reset: forget the earlier rounds and retry with this question alone
+    round = 0;
+    history.clear();
+    answer(input_str);
+    return;
+  }
+  auto time_1 = std::chrono::system_clock::now();
+  int pre_token = 0;
+  int token = forward_first(tokens);
+  auto time_2 = std::chrono::system_clock::now();
+  std::string result;
+  while (token != EOS && token_length < MAX_LEN) {
+    // Text of `token` = decode({pre_token, token}) minus decode({pre_token}).
+    std::string pre_word;
+    std::string word;
+    std::vector<int> pre_ids = {pre_token};
+    std::vector<int> ids = {pre_token, token};
+    sentencepiece.Decode(pre_ids, &pre_word);
+    sentencepiece.Decode(ids, &word);
+    std::string diff = word.substr(pre_word.size());
+    result += diff;
+    std::cout << diff << std::flush;
+    token_length++; // the loop condition already ensures token_length < MAX_LEN
+    tok_num++;
+    token = forward_next();
+  }
+  auto time_3 = std::chrono::system_clock::now();
+  auto ftl_dur =
+      std::chrono::duration_cast<std::chrono::microseconds>(time_2 - time_1);
+  auto tps_dur =
+      std::chrono::duration_cast<std::chrono::microseconds>(time_3 - time_2);
+  double tps = tok_num / (tps_dur.count() * 1e-6);
+  if (token_length >= MAX_LEN) {
+    printf(" ......\nWarning: cleanup early history\n");
+  }
+  printf("\nFTL:%f s, TPS: %f tokens/s\n", ftl_dur.count() * 1e-6, tps);
+  history.emplace_back(result);
+  // Near the length limit: drop an even number of the oldest history entries.
+  if (token_length + 128 >= MAX_LEN) {
+    int num = (history.size() + 3) / 4 * 2;
+    history.erase(history.begin(), history.begin() + num);
+  }
+}
+
+// Splits `s` at any character of `delim`; a trailing empty piece is dropped.
+static void split(const std::string &s, const std::string &delim,
+                  std::vector<std::string> &ret) {
+  size_t start = 0;
+  for (size_t pos = s.find_first_of(delim); pos != std::string::npos;
+       pos = s.find_first_of(delim, start)) {
+    ret.push_back(s.substr(start, pos - start));
+    start = pos + 1;
+  }
+  if (start < s.length()) {
+    ret.emplace_back(s, start);
+  }
+}
+
+static std::vector<int> parseCascadeDevices(const std::string &str) {
+  std::vector<std::string> fields;
+  split(str, ",", fields); // one entry per comma-separated device id
+  std::vector<int> ids;
+  for (const std::string &field : fields) {
+    ids.push_back(std::atoi(field.c_str())); // non-numeric text becomes 0
+  }
+  return ids;
+}
+
+// Reads --model/-m and --dev_id/-d (comma-separated device ids) from the
+// command line into the two output arguments. Any option getopt_long does
+// not accept terminates the process with EXIT_FAILURE.
+void processArguments(int argc, char *argv[], std::string &baichuan_model,
+                      std::vector<int> &devices) {
+  struct option longOptions[] = {{"model", required_argument, nullptr, 'm'},
+                                 {"dev_id", required_argument, nullptr, 'd'},
+                                 {nullptr, 0, nullptr, 0}};
+
+  int optionIndex = 0;
+  for (;;) {
+    const int opt =
+        getopt_long(argc, argv, "m:d:", longOptions, &optionIndex);
+    if (opt == -1) {
+      return; // all options consumed
+    }
+    if (opt == 'm') {
+      baichuan_model = optarg;
+    } else if (opt == 'd') {
+      devices = parseCascadeDevices(optarg);
+    } else {
+      exit(EXIT_FAILURE); // '?' or any other return value
+    }
+  }
+}
+
+// Entry point: parse options, set up the model, chat, then release devices.
+int main(int argc, char **argv) {
+  printf("Demo for Baichuan2-7B in BM1684X\n");
+  // Defaults below can be overridden with --model / --dev_id.
+  std::string model_path{"baichuan2-7b-test.bmodel"};
+  std::vector<int> device_ids{0};
+  processArguments(argc, argv, model_path, device_ids);
+  Baichuan2 model;
+  printf("Init Environment ...\n");
+  model.init(device_ids, model_path);
+  printf("==========================\n");
+  model.chat();
+  model.deinit();
+  return 0;
+}
diff --git a/models/Baichuan2/requirements.txt b/models/Baichuan2/requirements.txt
new file mode 100755
index 0000000..4708ef2
--- /dev/null
+++ b/models/Baichuan2/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.2
+transformers==4.36.2
+sentencepiece==0.1.99
+gradio==3.39.0
+mdtex2html==1.2.0
+accelerate
+onnx
diff --git a/models/Baichuan2/src/include/bmdef.h b/models/Baichuan2/src/include/bmdef.h
new file mode 100644
index 0000000..d41a4b0
--- /dev/null
+++ b/models/Baichuan2/src/include/bmdef.h
@@ -0,0 +1,129 @@
+/*****************************************************************************
+ *
+ *    Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
+ *
+ *    The material in this file is confidential and contains trade secrets
+ *    of Sophgo Technologies Inc. This is proprietary information owned by
+ *    Sophgo Technologies Inc. No part of this work may be disclosed,
+ *    reproduced, copied, transmitted, or used in any way for any purpose,
+ *    without the express written permission of Sophgo Technologies Inc.
+ *
+ *****************************************************************************/
+
+#ifndef __BMRUNTIME_DEFINE_H__
+#define __BMRUNTIME_DEFINE_H__
+
+#include "bmlib_runtime.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* --------------------------------------------------------------------------*/
+/* basic definitions */
+
+/* bm_data_type_t holds the type for a scalar value */
+typedef enum bm_data_type_e {
+  BM_FLOAT32 = 0,
+  BM_FLOAT16 = 1,
+  BM_INT8 = 2,
+  BM_UINT8 = 3,
+  BM_INT16 = 4,
+  BM_UINT16 = 5,
+  BM_INT32 = 6,
+  BM_UINT32 = 7,
+  BM_BFLOAT16 = 8,
+  BM_INT4 = 9,
+  BM_UINT4 = 10,
+} bm_data_type_t;
+
+/* store mode definitions */
+typedef enum bm_store_mode_e {
+  BM_STORE_1N = 0, /* default, if not sure, use 0 */
+  BM_STORE_2N = 1,
+  BM_STORE_4N = 2,
+} bm_store_mode_t;
+
+/* bm_shape_t holds the shape info */
+#define BM_MAX_DIMS_NUM 8
+typedef struct bm_shape_s {
+  int num_dims;
+  int dims[BM_MAX_DIMS_NUM];
+} bm_shape_t;
+
+typedef struct bm_shape_ex_s {
+  bm_shape_t shape;
+  int        elem_num;
+} bm_shape_ex_t;
+
+/*
+bm_tensor_t holds a multi-dimensional array of elements of a single data type
+and tensor are in device memory */
+typedef struct bm_tensor_s {
+  bm_data_type_t dtype;
+  bm_shape_t shape;
+  bm_device_mem_t device_mem;
+  bm_store_mode_t st_mode; /* user can set 0 as default store mode */
+} bm_tensor_t;
+
+/* --------------------------------------------------------------------------*/
+/* network information structure */
+
+/* bm_stage_info_t holds input/output shapes and device mems; every network can contain one or more
+ * stages */
+typedef struct bm_stage_info_s {
+  bm_shape_t *input_shapes;  /* input_shapes[0] / [1] / ... / [input_num-1] */
+  bm_shape_t *output_shapes; /* output_shapes[0] / [1] / ... / [output_num-1] */
+  bm_device_mem_t *input_mems; /* input_mems[0] / [1] / ... / [input_num-1] */
+  bm_device_mem_t *output_mems; /* output_mems[0] / [1] / ... / [output_num-1] */
+} bm_stage_info_t;
+
+/* bm_net_info_t holds all information of one net.
+ * scale for float type is 1.0 as default */
+typedef struct bm_net_info_s {
+  const char* name;              /* net name */
+  bool is_dynamic;               /* dynamic or static */
+  int input_num;                 /* number of inputs */
+  char const** input_names;      /* input_names[0] / [1] / .../ [input_num-1] */
+  bm_data_type_t* input_dtypes;  /* input_dtypes[0] / [1] / .../ [input_num-1] */
+  float* input_scales;           /* input_scales[0] / [1] / .../ [input_num-1] */
+  int output_num;                /* number of outputs */
+  char const** output_names;     /* output_names[0] / [1] / .../ [output_num-1] */
+  bm_data_type_t* output_dtypes; /* output_dtypes[0] / [1] / .../ [output_num-1] */
+  float* output_scales;          /* output_scales[0] / [1] / .../ [output_num-1] */
+  int stage_num;                 /* number of stages */
+  bm_stage_info_t* stages;       /* stages[0] / [1] / ... / [stage_num-1] */
+  size_t* max_input_bytes;       /* max_input_bytes[0]/ [1] / ... / [input_num-1] */
+  size_t* max_output_bytes;      /* max_output_bytes[0] / [1] / ... / [output_num-1] */
+  int* input_zero_point;         /* input_zero_point[0] / [1] / .../ [input_num-1] */
+  int* output_zero_point;        /* output_zero_point[0] / [1] / .../ [output_num-1] */
+  int *input_loc_devices;         /* input_loc_devices[0] / [1] / .../ [input_num-1] */
+  int *output_loc_devices;        /* output_loc_devices[0] / [1] / .../ [output_num-1] */
+} bm_net_info_t;
+
+typedef struct api_info_s {
+  /// @brief api_id to be sent to driver
+  int32_t api_id;
+  /// @brief api data to be sent to driver
+  uint8_t **api_data;
+  /// @brief size of the api data to be sent to driver
+  size_t api_data_size;
+  /// @brief subsize of the api data to be sent to driver
+  size_t *api_data_subsize;
+  /// @brief offset of input tensors' addr in api_data
+  uint32_t *input_addr_offset;
+  /// @brief number of the offset of input tensors' addr in api_data
+  size_t input_addr_offset_number;
+  /// @brief offset of output tensors' addr in api_data
+  uint32_t *output_addr_offset;
+  /// @brief number of the offset of output tensors' addr in api_data
+  size_t output_addr_offset_number;
+} api_info_c;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __BMRUNTIME_DEFINE_H__ */
diff --git a/models/Baichuan2/src/include/bmlib_runtime.h b/models/Baichuan2/src/include/bmlib_runtime.h
new file mode 100644
index 0000000..071cfe0
--- /dev/null
+++ b/models/Baichuan2/src/include/bmlib_runtime.h
@@ -0,0 +1,2581 @@
+/*****************************************************************************
+ *
+ *    Copyright (c) 2016-2026 by Bitmain Technologies Inc. All rights reserved.
+ *
+ *    The material in this file is confidential and contains trade secrets
+ *    of Bitmain Technologies Inc. This is proprietary information owned by
+ *    Bitmain Technologies Inc. No part of this work may be disclosed,
+ *    reproduced, copied, transmitted, or used in any way for any purpose,
+ *    without the express written permission of Bitmain Technologies Inc.
+ *
+ *****************************************************************************/
+
+/**************************************************************************
+ * bmlib_runtime defines interfaces that operate TPU devices.
+ * The functions can be divided into several categories.
+ * 1) device handle creation and destroy
+ * 2) memory help functions
+ * 3) global memory allocation and free
+ * 4) data transfer between host and device
+ * 5) data transfer within device memory
+ * 6) api send and synchronization
+ * 7) global memory map and coherence
+ * 8) trace and profile
+ * 9) power management
+ * 10) miscellaneous functions
+ *************************************************************************/
+
+#ifndef BMLIB_RUNTIME_H_
+#define BMLIB_RUNTIME_H_
+#if defined(_WIN32) && !defined(__MINGW32__)
+    #include <vadefs.h>
+    #define DECL_EXPORT __declspec(dllexport)
+    #define DECL_IMPORT __declspec(dllimport)
+#else
+	#include <stdbool.h>
+	#include <stddef.h>
+	#include <stdarg.h>
+    #define DECL_EXPORT
+    #define DECL_IMPORT
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  MODULE_CDMA = 0,
+  MODULE_GDMA = 1,
+  MODULE_TPU = 2,
+  MODULE_SMMU = 3,
+  MODULE_SRAM = 4,
+  MODULE_END = 5
+} MODULE_ID;
+
+#define BM_MEM_ADDR_NULL (0xfffffffff)
+
+#ifndef BM_MEM_DESC_T_
+#define BM_MEM_DESC_T_
+/* BM function return code definitions */
+typedef enum {
+  BM_SUCCESS = 0,
+  BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */
+  BM_ERR_FAILURE = 2,     /* General failure */
+  BM_ERR_TIMEOUT = 3,     /* Timeout */
+  BM_ERR_PARAM = 4,       /* Parameters invalid */
+  BM_ERR_NOMEM = 5,       /* Not enough memory */
+  BM_ERR_DATA = 6,        /* Data error */
+  BM_ERR_BUSY = 7,        /* Busy */
+  BM_ERR_NOFEATURE = 8,   /* Not supported yet */
+  BM_NOT_SUPPORTED = 9
+} bm_status_t;
+
+/* BM memory type definitions */
+typedef enum {
+  BM_MEM_TYPE_DEVICE = 0,
+  BM_MEM_TYPE_HOST = 1,
+  BM_MEM_TYPE_SYSTEM = 2,
+  BM_MEM_TYPE_INT8_DEVICE = 3,
+  BM_MEM_TYPE_INVALID = 4
+} bm_mem_type_t;
+
+typedef enum {
+  PERF_MONITOR_GDMA = 0,
+  PERF_MONITOR_TPU = 1
+} PERF_MONITOR_ID;
+
+typedef enum {
+  BMCPU_IDLE    = 0,
+  BMCPU_RUNNING = 1,
+  BMCPU_FAULT   = 2
+} bm_cpu_status_t;
+
+/*
+* bm performance monitor
+*/
+typedef struct bm_perf_monitor {
+  long long buffer_start_addr; /*buffer address to store perf data*/
+  int buffer_size; /*buffer size*/
+  PERF_MONITOR_ID monitor_id; /*PERF_MONITOR_GDMA or PERF_MONITOR_TPU*/
+} bm_perf_monitor_t;
+
+typedef union {
+  struct {
+    bm_mem_type_t mem_type : 3;
+    unsigned int gmem_heapid : 3;
+    unsigned int reserved : 26;
+  } u;
+  unsigned int rawflags;
+} bm_mem_flags_t;
+
+/* BM memory descriptor definition*/
+typedef struct bm_mem_desc {
+  union {
+    struct {
+#ifdef __linux__
+      unsigned long device_addr;
+#else
+      unsigned long long device_addr;
+#endif
+      unsigned int reserved;
+      int dmabuf_fd;
+    } device;
+
+    struct {
+      void *system_addr;
+      unsigned int reserved0;
+      int reserved1;
+    } system;
+  } u;
+
+  bm_mem_flags_t flags;
+  unsigned int size;
+} bm_mem_desc_t;
+
+typedef struct bm_mem_desc bm_device_mem_t;
+typedef struct bm_mem_desc bm_system_mem_t;
+
+typedef struct sg_mem_desc {
+  union {
+    struct {
+#ifdef __linux__
+      unsigned long device_addr;
+#else
+      unsigned long long device_addr;
+#endif
+      unsigned int reserved;
+      int dmabuf_fd;
+    } device;
+
+    struct {
+      void *system_addr;
+      unsigned int reserved0;
+      int reserved1;
+    } system;
+  } u;
+
+  bm_mem_flags_t flags;
+  unsigned long long size;
+} sg_mem_desc_t;
+
+typedef struct sg_mem_desc sg_device_mem_t;
+typedef struct sg_mem_desc sg_system_mem_t;
+#endif
+
+struct bm_context;
+typedef struct bm_context *bm_handle_t;
+
+#define MD5SUM_LEN 16
+#define LIB_MAX_NAME_LEN 64
+#define FUNC_MAX_NAME_LEN 64
+
+typedef struct bm_module
+{
+  // void *lib_handle;
+  char lib_name[LIB_MAX_NAME_LEN];
+  unsigned char md5[MD5SUM_LEN];
+}bm_module;
+
+typedef struct bm_module *tpu_kernel_module_t;
+typedef int tpu_kernel_function_t;
+
+/**
+ * @name    tpu_kernel_load_module_file
+ * @brief   To load dyn file
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  module_file     dyn file
+ * @retval  dyn lib ptr
+ */
+tpu_kernel_module_t tpu_kernel_load_module_file(bm_handle_t handle, const char *module_file);
+
+/**
+ * @name    tpu_kernel_load_module_file_key
+ * @brief   To load dyn file with key
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  module_file     dyn file
+ * @param [in]  key             identification str
+ * @param [in]  size            key size
+ * @retval  dyn lib ptr
+ */
+tpu_kernel_module_t tpu_kernel_load_module_file_key(bm_handle_t handle, const char *module_file, const char *key, int size);
+
+/**
+ * @name    tpu_kernel_unload_module
+ * @brief   To unload dyn file
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  p_module        dyn lib ptr
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_unload_module(bm_handle_t handle, tpu_kernel_module_t p_module);
+
+/**
+ * @name    tpu_kernel_free_module
+ * @brief   To free p_module when not use
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  p_module        dyn lib ptr
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_free_module(bm_handle_t handle, tpu_kernel_module_t p_module);
+
+/**
+ * @name    tpu_kernel_load_module
+ * @brief   To load dyn module
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  data            dyn module
+ * @param [in]  length          dyn module size
+ * @retval  dyn lib ptr
+ */
+tpu_kernel_module_t tpu_kernel_load_module(bm_handle_t handle, const char *data, size_t length);
+
+/**
+ * @name    tpu_kernel_get_function
+ * @brief   To get function from lib
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  module          dyn module
+ * @param [in]  function        function name
+ * @retval  function id
+ */
+tpu_kernel_function_t tpu_kernel_get_function(bm_handle_t handle, tpu_kernel_module_t module, const char *function);
+
+/**
+ * @name    tpu_kernel_launch
+ * @brief   To launch function with sync
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  function        function id
+ * @param [in]  args            function args
+ * @param [in]  size            args size
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_launch(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
+
+/**
+ * @name    tpu_kernel_launch_async
+ * @brief   To launch function with async
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  function        function id
+ * @param [in]  args            function args
+ * @param [in]  size            args size
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_launch_async(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
+
+/**
+ * @name    tpu_kernel_launch_async_multi_cores
+ * @brief   To launch function with async for multi cores
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  func_name       function name
+ * @param [in]  api_param       function params
+ * @param [in]  api_size        params size
+ * @param [in]  core_list       list of core ids
+ * @param [in]  core_num        number of cores
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_launch_async_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
+                                                size_t api_size, const int* core_list, const int core_num);
+
+/**
+ * @name    tpu_kernel_launch_sync_multi_cores
+ * @brief   To launch function with sync for multi cores
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  func_name       function name
+ * @param [in]  api_param       function params
+ * @param [in]  api_size        params size
+ * @param [in]  core_list       list of core ids
+ * @param [in]  core_num        number of cores
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_launch_sync_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
+                                              size_t api_size, const int* core_list, const int core_num);
+
+/**
+ * @name    tpu_kernel_sync
+ * @brief   To sync
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle          The device handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+bm_status_t tpu_kernel_sync(bm_handle_t handle);
+void show_md5(unsigned char md5[]);
+
+DECL_EXPORT void bmlib_log(const char *tag, int level, const char *fmt, ...);
+
+#ifndef USING_CMODEL
+#define BM_CHECK_RET(call)                                                    \
+  do {                                                                        \
+    bm_status_t ret = (bm_status_t)call;                                                   \
+    if (ret != BM_SUCCESS) {                                                  \
+      bmlib_log("BM_CHECK",16,"BM_CHECK_RET fail %s: %s: %d\n", __FILE__, __func__, __LINE__); \
+      return ret;                                                             \
+    }                                                                         \
+  } while (0)
+#else
+#define BM_CHECK_RET(call)                     \
+  do {                                         \
+    bm_status_t ret = call;                    \
+    if (ret != BM_SUCCESS) {                   \
+      bmlib_log("BM_CHECK",16,"BM_CHECK_RET failed %d\n", ret);\
+      ASSERT(0);                               \
+      exit(-ret);                              \
+    }                                          \
+  } while (0)
+#endif
+
+/*******************handle related functions *********************************/
+/**
+ * @name    bm_dev_getcount
+ * @brief   To get the number of sophon devices in system.
+ *          If N is got, valid devid is [0, N-1]
+ * @ingroup bmlib_runtime
+ *
+ * @param [out] count  The result number of sophon devices
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_dev_getcount(int *count);
+
+/**
+ * @name    bm_dev_query
+ * @brief   To query if a device is present
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] devid  The id of the device to query
+ * @retval  BM_SUCCESS Device is present
+ *          Other code Device is not present
+ */
+DECL_EXPORT bm_status_t bm_dev_query(int devid);
+
+/**
+ * @name    bm_dev_request
+ * @brief   To create a handle for the given device
+ * @ingroup bmlib_runtime
+ *
+ * @param [out] handle  The created handle
+ * @param [in]  devid   Specify on which device to create handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_dev_request(bm_handle_t *handle, int devid);
+
+/**
+ * @name    bm_get_devid
+ * @brief   To get device index for the given handle
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The given handle
+ * @retval  int  device index that the handle points to.
+ */
+DECL_EXPORT int bm_get_devid(bm_handle_t handle);
+
+/**
+ * @name    bm_dev_free
+ * @brief   To free a handle
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The handle to free
+ */
+DECL_EXPORT void bm_dev_free(bm_handle_t handle);
+
+/*******************memory help functions ************************************/
+/**
+ * @name    bm_mem_get_type
+ * @brief   To get a memory descriptor's type
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem  The memory descriptor queried
+ * @retval  BM_MEM_TYPE_DEVICE  Device global memory
+ * @retval  BM_MEM_TYPE_SYSTEM  Host user memory
+ */
+DECL_EXPORT bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem);
+
+/**
+ * @name    sg_mem_get_type
+ * @brief   To get a memory descriptor's type
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem  The memory descriptor queried
+ * @retval  BM_MEM_TYPE_DEVICE  Device global memory
+ * @retval  BM_MEM_TYPE_SYSTEM  Host user memory
+ */
+DECL_EXPORT bm_mem_type_t sg_mem_get_type(struct sg_mem_desc mem);
+
+/**
+ * @name    bm_mem_get_device_addr
+ * @brief   To get a device memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem  The device memory descriptor queried
+ * @retval  unsigned long long  The device memory address
+ */
+DECL_EXPORT unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);
+
+/**
+ * @name    sg_mem_get_device_addr
+ * @brief   To get a device memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem  The device memory descriptor queried
+ * @retval  unsigned long long  The device memory address
+ */
+DECL_EXPORT unsigned long long sg_mem_get_device_addr(struct sg_mem_desc mem);
+
+/**
+ * @name    bm_mem_set_device_addr
+ * @brief   To set a device memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  pmem   The device memory descriptor pointer
+ * @param [in]  addr  The new device address of the device memory
+ */
+DECL_EXPORT void bm_mem_set_device_addr(struct bm_mem_desc* pmem, unsigned long long addr);
+
+/**
+ * @name    sg_mem_set_device_addr
+ * @brief   To set a device memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  pmem   The device memory descriptor pointer
+ * @param [in]  addr  The new device address of the device memory
+ */
+DECL_EXPORT void sg_mem_set_device_addr(struct sg_mem_desc* pmem, unsigned long long addr);
+
+/**
+ * @name    bm_mem_get_device_size
+ * @brief   To get a device memory descriptor's size
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem      The device memory descriptor queried
+ * @retval unsigned int  The device memory's size in bytes
+ */
+DECL_EXPORT unsigned int bm_mem_get_device_size(struct bm_mem_desc mem);
+
+/**
+ * @name    sg_mem_get_device_size
+ * @brief   To get a device memory descriptor's size
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  mem      The device memory descriptor queried
+ * @retval unsigned long long  The device memory's size in bytes
+ */
+DECL_EXPORT unsigned long long sg_mem_get_device_size(struct sg_mem_desc mem);
+
+/**
+ * @name    bm_mem_set_device_size
+ * @brief   To set a device memory descriptor's size
+ * @ingroup bmlib_runtime
+ *
+ * @param [out]  pmem  The device memory descriptor pointer
+ * @param [in]  size  The new device memory size (in bytes) of the device memory
+ */
+DECL_EXPORT void bm_mem_set_device_size(struct bm_mem_desc* pmem, unsigned int size);
+
+/**
+ * @name    sg_mem_set_device_size
+ * @brief   To set a device memory descriptor's size
+ * @ingroup bmlib_runtime
+ *
+ * @param [out]  pmem  The device memory descriptor pointer
+ * @param [in]  size  The new device memory size (in bytes) of the device memory
+ */
+DECL_EXPORT void sg_mem_set_device_size(struct sg_mem_desc* pmem, unsigned long long size);
+
+/**
+ * @name    bm_set_device_mem
+ * @brief   To fill in a device memory descriptor with size and address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] pmem  The device memory descriptor pointer
+ * @param [in]  size  The device memory descriptor's size
+ * @param [in]  addr  The device memory descriptor's address
+ */
+DECL_EXPORT void bm_set_device_mem(bm_device_mem_t* pmem, unsigned int size,
+                       unsigned long long addr);
+
+/**
+ * @name    sg_set_device_mem
+ * @brief   To fill in a device memory descriptor with size and address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] pmem  The device memory descriptor pointer
+ * @param [in]  size  The device memory descriptor's size
+ * @param [in]  addr  The device memory descriptor's address
+ */
+DECL_EXPORT void sg_set_device_mem(sg_device_mem_t* pmem, unsigned long long size,
+                       unsigned long long addr);
+
+/**
+ * @name    bm_mem_from_device
+ * @brief   To create a device memory descriptor from address and size
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] device_addr The device memory address
+ * @param [in] len         The device memory size
+ * @retval bm_device_mem_t The device memory descriptor created
+ */
+DECL_EXPORT bm_device_mem_t bm_mem_from_device(unsigned long long device_addr,
+                                   unsigned int len);
+
+/**
+ * @name    sg_mem_from_device
+ * @brief   To create a device memory descriptor from address and size
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] device_addr The device memory address
+ * @param [in] len         The device memory size
+ * @retval sg_device_mem_t The device memory descriptor created
+ */
+DECL_EXPORT sg_device_mem_t sg_mem_from_device(unsigned long long device_addr,
+                                   unsigned long long len);
+
+/**
+ * @name    bm_mem_get_system_addr
+ * @brief   To get a system memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] mem  The system memory descriptor
+ * @retval void *   The system memory descriptor's address
+ */
+DECL_EXPORT void *bm_mem_get_system_addr(struct bm_mem_desc mem);
+
+/**
+ * @name    sg_mem_get_system_addr
+ * @brief   To get a system memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] mem  The system memory descriptor
+ * @retval void *   The system memory descriptor's address
+ */
+DECL_EXPORT void *sg_mem_get_system_addr(struct sg_mem_desc mem);
+
+/**
+ * @name    bm_mem_set_system_addr
+ * @brief   To set a system memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [out] pmem  The system memory descriptor pointer to update
+ * @param [in]  addr  The system memory address
+ */
+DECL_EXPORT void bm_mem_set_system_addr(struct bm_mem_desc* pmem, void *addr);
+
+/**
+ * @name    sg_mem_set_system_addr
+ * @brief   To set a system memory descriptor's address
+ * @ingroup bmlib_runtime
+ *
+ * @param [out] pmem  The system memory descriptor pointer to update
+ * @param [in]  addr  The system memory address
+ */
+DECL_EXPORT void sg_mem_set_system_addr(struct sg_mem_desc* pmem, void *addr);
+
+/**
+ * @name    bm_mem_from_system
+ * @brief   To create a system memory descriptor with the given system address
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  system_addr  The system address in the descriptor
+ * @retval  bm_system_mem_t  The system memory descriptor created
+ */
+DECL_EXPORT bm_system_mem_t bm_mem_from_system(void *system_addr);
+
+/*******************memory alloc and free functions ***************************/
+/**
+ * @name    bm_mem_null
+ * @brief   Return an illegal device memory descriptor
+ * @ingroup bmlib_runtime
+ *
+ * @retval  bm_device_mem_t  An invalid device memory descriptor
+ */
+DECL_EXPORT bm_device_mem_t bm_mem_null(void);
+#define BM_MEM_NULL (bm_mem_null())
+
+/**
+ * @name    bm_malloc_neuron_device
+ * @brief   To malloc device memory according to a tensor shape
+ *          (each neuron is 32 bits)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  n, c, h, w  The shape of the input tensor
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_malloc_neuron_device(bm_handle_t handle, bm_device_mem_t *pmem,
+                                    int n, int c, int h, int w);
+
+/**
+ * @name    sg_malloc_neuron_device
+ * @brief   To malloc device memory according to a tensor shape
+ *          (each neuron is 32 bits)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  n, c, h, w  The shape of the input tensor
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_malloc_neuron_device(bm_handle_t handle, sg_device_mem_t *pmem,
+                                    unsigned long long n, unsigned long long c,
+                                    unsigned long long h, unsigned long long w);
+
+/**
+ * @name    bm_malloc_device_dword
+ * @brief   To malloc device memory in size of dword (32 bits)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]   count  The number of dwords(32bits) to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_malloc_device_dword(bm_handle_t handle, bm_device_mem_t *pmem,
+                                   int count);
+
+/**
+ * @name    sg_malloc_device_dword
+ * @brief   To malloc device memory in size of dword (32 bits)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]   count  The number of dwords(32bits) to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_malloc_device_dword(bm_handle_t handle, sg_device_mem_t *pmem,
+                                   unsigned long long count);
+
+/**
+ * @name    bm_malloc_device_byte
+ * @brief   To malloc device memory in size of byte
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_malloc_device_byte(bm_handle_t handle, bm_device_mem_t *pmem,
+                                  unsigned int size);
+
+/**
+ * @name    sg_malloc_device_byte
+ * @brief   To malloc device memory in size of byte
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_malloc_device_byte(bm_handle_t handle, sg_device_mem_t *pmem,
+                                  unsigned long long size);
+
+/**
+ * @name    bm_malloc_device_byte_heap
+ * @brief   To malloc device memory in size of byte within the specified heap
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  heap_id The heap where to allocate  0/1/2
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_malloc_device_byte_heap(bm_handle_t handle, bm_device_mem_t *pmem,
+                                  int heap_id, unsigned int size);
+
+/**
+ * @name    sg_malloc_device_byte_heap
+ * @brief   To malloc device memory in size of byte within the specified heap
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  heap_id The heap where to allocate  0/1/2
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_malloc_device_byte_heap(bm_handle_t handle, sg_device_mem_t *pmem,
+                                  int heap_id, unsigned long long size);
+
+/**
+ * @name    bm_malloc_device_byte_heap_mask
+ * @brief   To malloc device memory in size of byte within the specified heaps
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  heap_id_mask The mask which heaps allocate from. each bit indicate one heap
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_malloc_device_byte_heap_mask(bm_handle_t handle, bm_device_mem_t *pmem,
+                                  int heap_id_mask, unsigned int size);
+
+/**
+ * @name    sg_malloc_device_byte_heap_mask
+ * @brief   To malloc device memory in size of byte within the specified heaps
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  pmem   The result device memory descriptor
+ * @param [in]  heap_id_mask The mask which heaps allocate from. each bit indicate one heap
+ * @param [in]   size   The number of bytes to allocate
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_malloc_device_byte_heap_mask(bm_handle_t handle, sg_device_mem_t *pmem,
+                                  int heap_id_mask, unsigned long long size);
+
+/**
+ * @name    bm_free_device
+ * @brief   To free device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  mem     The device memory descriptor to free
+ */
+DECL_EXPORT void bm_free_device(bm_handle_t handle, bm_device_mem_t mem);
+
+/**
+ * @name    sg_free_device
+ * @brief   To free device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  mem     The device memory descriptor to free
+ */
+DECL_EXPORT void sg_free_device(bm_handle_t handle, sg_device_mem_t mem);
+
+/**
+ * @name    bm_gmem_arm_reserved_request
+ * @brief   To obtain the address of global memory reserved for arm926
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ *
+ * @retval unsigned long long  The absolute address of gmem reserved for arm926
+ */
+DECL_EXPORT unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle);
+
+/**
+ * @name    bm_gmem_arm_reserved_release
+ * @brief   To release the global memory reserved for arm926
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ */
+DECL_EXPORT void bm_gmem_arm_reserved_release(bm_handle_t handle);
+
+/*******************memory copy functions *************************************/
+/**
+ * @name    bm_memcpy_s2d
+ * @brief   To copy data from system memory to device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in] dst     The destination memory (device memory descriptor )
+ * @param [in] src     The source memory (system memory, a void* pointer)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_s2d(bm_handle_t handle, bm_device_mem_t dst, void *src);
+
+/**
+ * @name    bm_memcpy_p2p
+ * @brief   To copy data from one chip to another chip
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle_src The source device handle
+ * @param [in] src        The source memory (device memory descriptor )
+ * @param [in] handle_dst The destination device handle
+ * @param [in] dst        The destination memory (device memory descriptor )
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_p2p(bm_handle_t handle_src, bm_device_mem_t src, bm_handle_t handle_dst,bm_device_mem_t dst);
+
+/**
+ * @name    sg_memcpy_s2d
+ * @brief   To copy data from system memory to device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in] dst     The destination memory (device memory descriptor )
+ * @param [in] src     The source memory (system memory, a void* pointer)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_s2d(bm_handle_t handle, sg_device_mem_t dst, void *src);
+
+/**
+ * @name    bm_memcpy_s2d_partial_offset
+ * @brief   To copy specified bytes of data from system memory to device memory
+ *          with an offset in device memory address.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in]  dst    The destination memory (device memory descriptor)
+ * @param [in]  src    The source memory (system memory, a void* pointer)
+ * @param [in] size    The size of data to copy (in bytes)
+ * @param [in] offset  The offset of the device memory address
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_s2d_partial_offset(bm_handle_t handle,
+                                         bm_device_mem_t dst, void *src,
+                                         unsigned int size,
+                                         unsigned int offset);
+
+/**
+ * @name    sg_memcpy_s2d_partial_offset
+ * @brief   To copy specified bytes of data from system memory to device memory
+ *          with an offset in device memory address.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in]  dst    The destination memory (device memory descriptor)
+ * @param [in]  src    The source memory (system memory, a void* pointer)
+ * @param [in] size    The size of data to copy (in bytes)
+ * @param [in] offset  The offset of the device memory address
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_s2d_partial_offset(bm_handle_t handle,
+                                         sg_device_mem_t dst, void *src,
+                                         unsigned long long size,
+                                         unsigned long long offset);
+
+/**
+ * @name    bm_memcpy_s2d_partial
+ * @brief   To copy specified bytes of data from system memory to device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in]  dst    The destination memory (device memory descriptor)
+ * @param [in]  src    The source memory (system memory, a void* pointer)
+ * @param [in] size    The size of data to copy (in bytes)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_s2d_partial(bm_handle_t handle, bm_device_mem_t dst,
+                                  void *src, unsigned int size);
+
+/**
+ * @name    sg_memcpy_s2d_partial
+ * @brief   To copy specified bytes of data from system memory to device memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in]  dst    The destination memory (device memory descriptor)
+ * @param [in]  src    The source memory (system memory, a void* pointer)
+ * @param [in] size    The size of data to copy (in bytes)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_s2d_partial(bm_handle_t handle, sg_device_mem_t dst,
+                                  void *src, unsigned long long size);
+
+/**
+ * @name    bm_memcpy_d2s
+ * @brief   To copy data from device memory to system memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2s(bm_handle_t handle, void *dst, bm_device_mem_t src);
+
+/**
+ * @name    sg_memcpy_d2s
+ * @brief   To copy data from device memory to system memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_d2s(bm_handle_t handle, void *dst, sg_device_mem_t src);
+
+/**
+ * @name    bm_memcpy_d2s_partial_offset
+ * @brief   To copy specified bytes of data from device memory to system memory
+ *          with an offset in device memory address.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ * @param [in]  size   The size of data to copy (in bytes)
+ * @param [in]  offset The offset of the device memory address
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
+                                         bm_device_mem_t src, unsigned int size,
+                                         unsigned int offset);
+
+/**
+ * @name    sg_memcpy_d2s_partial_offset
+ * @brief   To copy specified bytes of data from device memory to system memory
+ *          with an offset in device memory address.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ * @param [in]  size   The size of data to copy (in bytes)
+ * @param [in]  offset The offset of the device memory address
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
+                                         sg_device_mem_t src, unsigned long long size,
+                                         unsigned long long offset);
+
+/**
+ * @name    bm_memcpy_d2s_partial
+ * @brief   To copy specified bytes of data from device memory to system memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ * @param [in]  size   The size of data to copy (in bytes)
+ *
+ * @retval  BM_SUCCESS  Data transfer succeeds.
+ *          Other code  Data transfer fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2s_partial(bm_handle_t handle, void *dst,
+                                  bm_device_mem_t src, unsigned int size);
+
+/**
+ * @name    sg_memcpy_d2s_partial
+ * @brief   To copy specified bytes of data from device memory to system memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle The device handle
+ * @param [out] dst    The destination memory (system memory, a void* pointer)
+ * @param [in]  src    The source memory (device memory descriptor)
+ * @param [in]  size   The size of data to copy (in bytes)
+ *
+ * @retval  BM_SUCCESS  Data transfer succeeds.
+ *          Other code  Data transfer fails.
+ */
+DECL_EXPORT bm_status_t sg_memcpy_d2s_partial(bm_handle_t handle, void *dst,
+                                  sg_device_mem_t src, unsigned long long size);
+
+/**
+ * @name    bm_memcpy_d2d
+ * @brief   To copy specified dwords of data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination offsets can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle     The device handle
+ * @param [in]  dst       The destination device memory
+ * @param [in] dst_offset The offset of destination device memory address
+ * @param [in]  src       The source device memory
+ * @param [in] src_offset The offset of source device memory address
+ * @param [in]  len       Length of data to copy (in DWORD 4 bytes)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d(bm_handle_t handle, bm_device_mem_t dst,
+                          int dst_offset, bm_device_mem_t src, int src_offset,
+                          int len);
+
+/**
+ * @name    bm_memcpy_d2d_with_core
+ * @brief   To copy specified dwords of data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination offsets can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle     The device handle
+ * @param [in]  dst       The destination device memory
+ * @param [in] dst_offset The offset of destination device memory address
+ * @param [in]  src       The source device memory
+ * @param [in] src_offset The offset of source device memory address
+ * @param [in]  len       Length of data to copy (in DWORD 4 bytes)
+ * @param [in] core_id    The core id to copy
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d_with_core(bm_handle_t handle, bm_device_mem_t dst,
+                          int dst_offset, bm_device_mem_t src, int src_offset,
+                          int len, int core_id);
+
+/**
+ * @name    bm_memcpy_d2d_byte
+ * @brief   To copy specified bytes of data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination offsets can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle     The device handle
+ * @param [in]  dst       The destination device memory
+ * @param [in] dst_offset The offset of destination device memory address (in bytes)
+ * @param [in]  src       The source device memory
+ * @param [in] src_offset The offset of source device memory address (in bytes)
+ * @param [in]  size      Size of data to copy (in bytes)
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d_byte(bm_handle_t handle, bm_device_mem_t dst,
+                               size_t dst_offset, bm_device_mem_t src,
+                               size_t src_offset, size_t size);
+
+/**
+ * @name    bm_memcpy_d2d_byte_with_core
+ * @brief   To copy specified bytes of data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination offsets can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle     The device handle
+ * @param [in]  dst       The destination device memory
+ * @param [in] dst_offset The offset of destination device memory address (in bytes)
+ * @param [in]  src       The source device memory
+ * @param [in] src_offset The offset of source device memory address (in bytes)
+ * @param [in]  size      Size of data to copy (in bytes)
+ * @param [in] core_id    The core id to copy
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d_byte_with_core(bm_handle_t handle, bm_device_mem_t dst,
+                               size_t dst_offset, bm_device_mem_t src,
+                               size_t src_offset, size_t size, int core_id);
+
+/**
+ * @name    bm_memcpy_d2d_stride
+ * @brief   To copy specified data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination strides can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle      The device handle
+ * @param [in] dst         The destination device memory
+ * @param [in] dst_stride  The data stride of destination data
+ * @param [in] src         The source device memory
+ * @param [in] src_stride  The data stride of source data
+ * @param [in] count       Count of data to copy
+ * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
+ *                         format_size only supports 1/2/4.
+ *
+ * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d_stride(bm_handle_t     handle,
+                                 bm_device_mem_t dst,
+                                 int             dst_stride,
+                                 bm_device_mem_t src,
+                                 int             src_stride,
+                                 int             count,
+                                 int             format_size);
+
+/**
+ * @name    bm_memcpy_d2d_stride_with_core
+ * @brief   To copy specified data from one piece of device memory
+ *          to another piece of device memory within one device. Both source
+ *          and destination strides can be specified.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle      The device handle
+ * @param [in] dst         The destination device memory
+ * @param [in] dst_stride  The data stride of destination data
+ * @param [in] src         The source device memory
+ * @param [in] src_stride  The data stride of source data
+ * @param [in] count       Count of data to copy
+ * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
+ *                         format_size only supports 1/2/4.
+ * @param [in] core_id     The core id to copy.
+ *
+ * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_d2d_stride_with_core(bm_handle_t     handle,
+                                 bm_device_mem_t dst,
+                                 int             dst_stride,
+                                 bm_device_mem_t src,
+                                 int             src_stride,
+                                 int             count,
+                                 int             format_size,
+                                 int             core_id);
+
+/**
+ * @name    bm_memcpy_c2c
+ * @brief   To copy data from one chip to another chip.
+ *          (Used in multi-chip card scenario)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] src_handle The source device handle
+ * @param [in] dst_handle The destination device handle
+ * @param [in] src        The source device memory descriptor
+ * @param [in] dst        The destination device memory descriptor
+ * @param [in] force_dst_cdma If use the CDMA engine of the destination device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memcpy_c2c(bm_handle_t src_handle, bm_handle_t dst_handle,
+                          bm_device_mem_t src, bm_device_mem_t dst,
+                          bool force_dst_cdma);
+
+/**
+ * @name    bm_memset_device
+ * @brief   To fill in specified device memory with the given value
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   value  The value used to fill. (int type)
+ * @param [in]  mem     The device memory which will be filled in
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memset_device(bm_handle_t handle, const int value,
+                             bm_device_mem_t mem);
+
+/**
+ * @name    bm_memset_device_ext
+ * @brief   To fill in specified device memory with the given value and mode
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   value  The pointer of value used to fill
+ * @param [in]   mode   The valid bytes of *value
+ * @param [in]  mem     The device memory which will be filled in
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_memset_device_ext(bm_handle_t handle, void* value, int mode,
+                             bm_device_mem_t mem);
+
+/**
+ * @name    bm_mem_convert_system_to_device_neuron
+ * @brief   To malloc a piece of device memory according to the shape of
+ *          neuron(in DWORD 4 bytes); copy neuron from system memory to
+ *          device memory if need_copy is true.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dev_mem The device memory descriptor
+ * @param [in]  sys_mem The system memory descriptor
+ * @param [in]  need_copy If copy from system to device is needed
+ * @param [in]  n,c,h,w  Neuron shape size
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron(bm_handle_t handle,
+                                                   struct bm_mem_desc *dev_mem,
+                                                   struct bm_mem_desc sys_mem,
+                                                   bool need_copy, int n, int c,
+                                                   int h, int w);
+
+/**
+ * @name    bm_mem_convert_system_to_device_neuron_byte
+ * @brief   To malloc a piece of device memory according to the shape of
+ *          neuron(in bytes); copy neuron from system memory to
+ *          device memory if need_copy is true.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dev_mem The device memory descriptor
+ * @param [in]  sys_mem The system memory descriptor
+ * @param [in]  need_copy If copy from system to device is needed
+ * @param [in]  n,c,h,w  Neuron shape size
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron_byte(
+    bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
+    bool need_copy, int n, int c, int h, int w);
+
+/**
+ * @name    bm_mem_convert_system_to_device_coeff
+ * @brief   To malloc a piece of device memory according to the size of
+ *          coefficient (in DWORD 4 bytes); copy coefficient from system
+ *          memory to device memory if need_copy is true.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dev_mem The device memory descriptor
+ * @param [in]  sys_mem The system memory descriptor
+ * @param [in]  need_copy If copy from system to device is needed
+ * @param [in]  coeff_count Coefficient size
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff(bm_handle_t handle,
+                                                  struct bm_mem_desc *dev_mem,
+                                                  struct bm_mem_desc sys_mem,
+                                                  bool need_copy,
+                                                  int coeff_count);
+/**
+ * @name    bm_mem_convert_system_to_device_coeff_byte
+ * @brief   To malloc a piece of device memory according to the size of
+ *          coefficient (in bytes); copy coefficient from system
+ *          memory to device memory if need_copy is true.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dev_mem The device memory descriptor
+ * @param [in]  sys_mem The system memory descriptor
+ * @param [in]  need_copy If copy from system to device is needed
+ * @param [in]  coeff_count Coefficient size
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff_byte(
+    bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
+    bool need_copy, int coeff_count);
+
+/*******************memory map functions *************************************/
+/**
+ * @name    bm_mem_mmap_device_mem
+ * @brief   To map a piece of device memory to user space with cache enabled.
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dmem    The device memory to map
+ * @param [out] vmem    The virtual address of the mapped device memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_mmap_device_mem(bm_handle_t handle, bm_device_mem_t *dmem,
+
+        unsigned long long *vmem);
+
+/**
+ * @name    sg_mem_mmap_device_mem
+ * @brief   To map a piece of device memory to user space with cache enabled.
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dmem    The device memory to map
+ * @param [out] vmem    The virtual address of the mapped device memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_mmap_device_mem(bm_handle_t handle, sg_device_mem_t *dmem,
+        unsigned long long *vmem);
+
+/*******************memory map functions *************************************/
+/**
+ * @name    bm_mem_mmap_device_mem_no_cache
+ * @brief   To map a piece of device memory to user space with cache disabled.
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dmem    The device memory to map
+ * @param [out] vmem    The virtual address of the mapped device memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_mmap_device_mem_no_cache(bm_handle_t handle, bm_device_mem_t *dmem,
+
+        unsigned long long *vmem);
+
+/**
+ * @name    sg_mem_mmap_device_mem_no_cache
+ * @brief   To map a piece of device memory to user space with cache disabled.
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  dmem    The device memory to map
+ * @param [out] vmem    The virtual address of the mapped device memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_mmap_device_mem_no_cache(bm_handle_t handle, sg_device_mem_t *dmem,
+        unsigned long long *vmem);
+
+/**
+ * @name    bm_mem_vir_to_phy
+ * @brief   To get device mem address through the mapped virtual address.
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle     The device handle
+ * @param [in]  vmem       The virtual address of the mapped device memory
+ * @param [out] device_mem The device memory address
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_vir_to_phy(bm_handle_t handle, unsigned long long vmem,
+        unsigned long long *device_mem);
+/**
+ * @name    bm_mem_invalidate_device_mem
+ * @brief   To invalidate a piece of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to invalidate
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+
+DECL_EXPORT bm_status_t bm_mem_invalidate_device_mem(bm_handle_t handle,
+                                         bm_device_mem_t *dmem);
+
+/**
+ * @name    sg_mem_invalidate_device_mem
+ * @brief   To invalidate a piece of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to invalidate
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+
+DECL_EXPORT bm_status_t sg_mem_invalidate_device_mem(bm_handle_t handle,
+                                         sg_device_mem_t *dmem);
+
+/**
+ * @name    bm_mem_invalidate_partial_device_mem
+ * @brief   To invalidate part of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to invalidate
+ * @param [in]  offset  The offset of device memory address
+ * @param [in]  len     The length of memory to invalidate in bytes
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_invalidate_partial_device_mem(bm_handle_t handle,
+                                                 bm_device_mem_t *dmem,
+                                                 unsigned int offset,
+                                                 unsigned int len);
+
+/**
+ * @name    sg_mem_invalidate_partial_device_mem
+ * @brief   To invalidate part of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to invalidate
+ * @param [in]  offset  The offset of device memory address
+ * @param [in]  len     The length of memory to invalidate in bytes
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_invalidate_partial_device_mem(bm_handle_t handle,
+                                                 sg_device_mem_t *dmem,
+                                                 unsigned long long offset,
+                                                 unsigned long long len);
+
+/**
+ * @name    bm_mem_flush_device_mem
+ * @brief   To flush a piece of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to flush
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_flush_device_mem(bm_handle_t handle, bm_device_mem_t *dmem);
+
+/**
+ * @name    sg_mem_flush_device_mem
+ * @brief   To flush a piece of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to flush
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_flush_device_mem(bm_handle_t handle, sg_device_mem_t *dmem);
+
+/**
+ * @name    bm_mem_flush_partial_device_mem
+ * @brief   To flush part of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to flush
+ * @param [in]  offset  The offset of device memory address
+ * @param [in]  len     The length of memory to flush in bytes
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_flush_partial_device_mem(bm_handle_t handle,
+                                            bm_device_mem_t *dmem,
+                                            unsigned int offset,
+                                            unsigned int len);
+
+/**
+ * @name    sg_mem_flush_partial_device_mem
+ * @brief   To flush part of mapped device memory to maintain
+ *          cache coherence
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   dmem   The device memory to flush
+ * @param [in]  offset  The offset of device memory address
+ * @param [in]  len     The length of memory to flush in bytes
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_flush_partial_device_mem(bm_handle_t handle,
+                                            sg_device_mem_t *dmem,
+                                            unsigned long long offset,
+                                            unsigned long long len);
+
+/**
+ * @name    bm_mem_unmap_device_mem
+ * @brief   To unmap a piece of mapped device memory
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   vmem   The virtual address of the mapped device memory
+ * @param [in]  size    The size of unmapped memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_mem_unmap_device_mem(bm_handle_t handle, void *vmem, int size);
+
+/**
+ * @name    sg_mem_unmap_device_mem
+ * @brief   To unmap a piece of mapped device memory
+ *          (only valid in SoC mode; Not supported in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   vmem   The virtual address of the mapped device memory
+ * @param [in]  size    The size of unmapped memory
+ *
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t sg_mem_unmap_device_mem(bm_handle_t handle, void *vmem, unsigned long long size);
+
+/*******************api(kernel) functions *************************************/
+/**
+ * @name    bm_flush
+ * @brief   To synchronize APIs of the current thread. The thread will block
+ *          until all the outstanding APIs of the current thread are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ */
+DECL_EXPORT void bm_flush(bm_handle_t handle);
+
+/**
+ * @name    bm_device_sync
+ * @brief   To synchronize APIs of the device. The thread will block
+ *          until all the outstanding APIs of the device are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle   The device handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_device_sync(bm_handle_t handle);
+
+/**
+ * @name    bm_handle_sync
+ * @brief   To synchronize APIs of the handle. The thread will block
+ *          until all the outstanding APIs of the handle are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle   The device handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_handle_sync(bm_handle_t handle);
+
+/**
+ * @name    bm_handle_sync_from_core
+ * @brief   To synchronize APIs of the handle. The thread will block
+ *          until all the outstanding APIs of the handle are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle   The device handle
+ * @param [in] core_id  The core id
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_handle_sync_from_core(bm_handle_t handle, int core_id);
+
+/**
+ * @name    bm_thread_sync
+ * @brief   To synchronize APIs of the current thread. The thread will block
+ *          until all the outstanding APIs of the current thread are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @retval  BM_SUCCESS Succeeds.
+ *          Other code Fails.
+ */
+DECL_EXPORT bm_status_t bm_thread_sync(bm_handle_t handle);
+
+/**
+ * @name    bm_thread_sync_from_core
+ * @brief   To synchronize APIs of the current thread. The thread will block
+ *          until all the outstanding APIs of the current thread are finished.
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle  The device handle
+ * @param [in] core_id The core id
+ * @retval  BM_SUCCESS Succeeds.
+ *          Other code Fails.
+ */
+DECL_EXPORT bm_status_t bm_thread_sync_from_core(bm_handle_t handle, int core_id);
+
+/*******************trace and profile related functions **********************/
+typedef struct bm_profile {
+#ifdef __linux__
+  unsigned long cdma_in_time;
+  unsigned long cdma_in_counter;
+  unsigned long cdma_out_time;
+  unsigned long cdma_out_counter;
+  unsigned long tpu_process_time;
+  unsigned long tpu1_process_time;
+  unsigned long sent_api_counter;
+  unsigned long completed_api_counter;
+#else
+  unsigned long long cdma_in_time;
+  unsigned long long cdma_in_counter;
+  unsigned long long cdma_out_time;
+  unsigned long long cdma_out_counter;
+  unsigned long long tpu_process_time;
+  unsigned long long tpu1_process_time;
+  unsigned long long sent_api_counter;
+  unsigned long long completed_api_counter;
+#endif
+} bm_profile_t;
+/**
+ * @name    bm_get_profile
+ * @brief   To get the profile data at the moment
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out] profile The result profile data
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_profile(bm_handle_t handle, bm_profile_t *profile);
+
+typedef struct bootloader_version{
+	char *bl1_version;
+	char *bl2_version;
+	char *bl31_version;
+	char *uboot_version;
+} boot_loader_version;
+
+/**
+ * @name    bm_get_boot_loader_version
+ * @brief   To get the boot_loader_version
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out] version The result version data
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_boot_loader_version(bm_handle_t handle, boot_loader_version *version);
+
+/**
+ * @name    bm_get_vpu_instant_usage
+ * @brief   To get vpu usage
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle    The device handle
+ * @param [out] vpu_usage The result vpu usage
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_vpu_instant_usage(bm_handle_t handle, int *vpu_usage);
+
+/**
+ * @name    bm_get_jpu_core_usage
+ * @brief   To get the jpu usage
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle    The device handle
+ * @param [out] jpu_usage The result jpu usage
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_jpu_core_usage(bm_handle_t handle, int *jpu_usage);
+
+/**
+ * @name    bm_get_vpp_instant_usage
+ * @brief   To get the vpp usage
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle    The device handle
+ * @param [out] vpp_usage The result vpp usage
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_vpp_instant_usage(bm_handle_t handle, int *vpp_usage);
+/**
+ * @name    bm_get_last_api_process_time_us
+ * @brief   This function is abandoned.
+ */
+#ifdef __linux__
+DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
+                                            unsigned long *time_us);
+#else
+DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
+											unsigned long long *time_us);
+#endif
+/*******************tpu clock and module reset related functions *************/
+
+/**
+ * @name    bm_set_clk_tpu_freq
+ * @brief   To set the clock frequency of TPU (only valid in PCIE mode).
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]   freq   The TPU target frequency
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_set_clk_tpu_freq(bm_handle_t handle, int freq);
+
+/**
+ * @name    bm_get_clk_tpu_freq
+ * @brief   To get the clock frequency of TPU
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out]  freq   The current TPU frequency
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_clk_tpu_freq(bm_handle_t handle, int *freq);
+
+/*******************misc functions ********************************************/
+struct bm_misc_info {
+  int pcie_soc_mode;  /*0---pcie; 1---soc*/
+  int ddr_ecc_enable; /*0---disable; 1---enable*/
+  long long ddr0a_size;
+  long long ddr0b_size;
+  long long ddr1_size;
+  long long ddr2_size;
+  unsigned int chipid;
+#define BM1682_CHIPID_BIT_MASK (0X1 << 0)
+#define BM1684_CHIPID_BIT_MASK (0X1 << 1)
+#define BM1686_CHIPID_BIT_MASK (0X1 << 2)
+#ifdef __linux__
+  unsigned long chipid_bit_mask;
+#else
+	unsigned long long chipid_bit_mask;
+#endif
+  unsigned int driver_version;
+  int domain_bdf;
+  int board_version; /*hardware board version [23:16]-mcu sw version, [15:8]-board type, [7:0]-hw version*/
+  int a53_enable;
+  int dyn_enable;
+};
+
+/**
+ * @name    bm_get_misc_info
+ * @brief   To get miscellaneous information of the device
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle     The device handle
+ * @param [out] pmisc_info The fetched misc info
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_misc_info(bm_handle_t handle, struct bm_misc_info *pmisc_info);
+
+/**
+ * @name    bm_get_chipid
+ * @brief   To get the chipid of the device. (0x1682 / 0x1684 / 0x168?)
+ * @ingroup bmlib_runtime
+ *
+ * @param [in] handle    The device handle
+ * @param [out] p_chipid The chip id of the device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_chipid(bm_handle_t handle, unsigned int *p_chipid);
+
+#define BMLIB_LOG_QUIET    -8
+#define BMLIB_LOG_PANIC     0
+#define BMLIB_LOG_FATAL     8
+#define BMLIB_LOG_ERROR    16
+#define BMLIB_LOG_WARNING  24
+#define BMLIB_LOG_INFO     32
+#define BMLIB_LOG_VERBOSE  40
+#define BMLIB_LOG_DEBUG    48
+#define BMLIB_LOG_TRACE    56
+
+/**
+ * @name    bmlib_log_get_level
+ * @brief   To get the bmlib log level
+ * @ingroup bmlib_log
+ *
+ * @param void
+ * @retval  The level of bmlib log level
+ */
+DECL_EXPORT int  bmlib_log_get_level(void);
+
+/**
+ * @name    bmlib_log_set_level
+ * @brief   To set the bmlib log level
+ * @ingroup bmlib_log
+ *
+ * @param [in] level    The level of bmlib log level
+ * @retval  void
+ */
+DECL_EXPORT void bmlib_log_set_level(int level);
+
+/**
+ * @name    bmlib_log_set_callback
+ * @brief   To set callback to get bmlib log
+ * @ingroup bmlib_log
+ *
+ * @param [in]  callback     The callback function to get bmlib log
+ * @retval  void
+ */
+DECL_EXPORT void bmlib_log_set_callback(void (*callback)(const char*, int, const char*, va_list args));
+
+/**
+ * @name    bm_set_debug_mode
+ * @brief   To set the debug mode for firmware log for tpu
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  mode    The debug mode of fw log, 0/1 for disable/enable log
+ * @retval  void
+ */
+DECL_EXPORT void bm_set_debug_mode(bm_handle_t handle, int mode);
+
+/**
+ * @name    bmlib_set_api_dbg_callback
+ * @brief   To set debug callback to get firmware log
+ * @ingroup bmlib_log
+ *
+ * @param [in]  callback  callback to get firmware log
+ * @retval  void
+ */
+typedef void (*bmlib_api_dbg_callback)(int, int, int, const char*);
+// callback args: api, result, duration, log (third int, api duration, is for future use)
+DECL_EXPORT void bmlib_set_api_dbg_callback(bmlib_api_dbg_callback callback);
+
+/**
+ * @name    bmcpu_get_cpu_status
+ * @brief   Get bmcpu status
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @retval  BMCPU_RUNNING  bmcpu is running.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_cpu_status_t bmcpu_get_cpu_status(bm_handle_t handle);
+
+/**
+ * @name    bmcpu_start_cpu
+ * @brief   Start cpu in pcie mode
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  boot_file       Fip file
+ * @param [in]  core_file       Itb file
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_start_cpu(bm_handle_t handle, char *boot_file, char *core_file);
+
+/**
+ * @name    bmcpu_open_process
+ * @brief   Open a process to do some work
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  flags           Process flags
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  >= 0 process handle
+ *          < 0  Other code Fails.
+ */
+DECL_EXPORT int bmcpu_open_process(bm_handle_t handle, unsigned int flags, int timeout);
+
+/**
+ * @name    bmcpu_load_library
+ * @brief   Load a shared library(so) to specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  library_file    Library file path
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_load_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);
+
+/**
+ * @name    bmcpu_unload_library
+ * @brief   Unload a shared library(so) from specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  library_file    Library file path
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_unload_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);
+
+/**
+ * @name    bmcpu_exec_function
+ * @brief   Execute specific function in specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  function_name   Function name
+ * @param [in]  function_param  Function parameters
+ * @param [in]  param_size      Parameters size in bytes
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  0   success.
+ *          >0  code fails from bmlib
+ *          <0  code fails from function
+ */
+DECL_EXPORT int bmcpu_exec_function(bm_handle_t handle,
+                     int process_handle,
+                     char *function_name,
+                     void *function_param,
+                     unsigned int param_size,
+                     int timeout);
+
+#define BMCPU_EXEC_OPT_NO_FLUSH_CACHE     1
+/**
+ * @name    bmcpu_exec_function_ext
+ * @brief   Execute specific function in specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  function_name   Function name
+ * @param [in]  function_param  Function parameters
+ * @param [in]  param_size      Parameters size in bytes
+ * @param [in]  opt             exec options
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  0   success.
+ *          >0  code fails from bmlib
+ *          <0  code fails from function
+ */
+DECL_EXPORT int bmcpu_exec_function_ext(bm_handle_t  handle,
+                            int process_handle,
+                            char *function_name,
+                            void *function_param,
+                            unsigned int param_size,
+                            unsigned int opt,
+                            int timeout);
+
+/**
+ * @name    bmcpu_exec_function_async
+ * @brief   Execute specific function in specific process asynchronously;
+ *          user should use bmcpu_query_exec_function_result to query result
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  function_name   Function name
+ * @param [in]  function_param  Function param
+ * @param [in]  param_size      Param size in bytes
+ * @param [out] api_handle      Api handle to pass to bmcpu_query_exec_function_result
+ * @retval  BM_SUCCESS  Succeeds. Other code fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_exec_function_async(bm_handle_t handle,
+                                   int process_handle,
+                                   char *function_name,
+                                   void *function_param,
+                                   unsigned int param_size,
+                                   unsigned long long *api_handle);
+
+/**
+ * @name    bmcpu_exec_function_async_ext
+ * @brief   Execute specific function in specific process asynchronously;
+ *          user should use bmcpu_query_exec_function_result to query result
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  function_name   Function name
+ * @param [in]  function_param  Function param
+ * @param [in]  param_size      Param size in bytes
+ * @param [in]  opt             exec options
+ * @param [out] api_handle      Api handle to pass to bmcpu_query_exec_function_result
+ * @retval  BM_SUCCESS  Succeeds. Other code fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_exec_function_async_ext(bm_handle_t handle,
+                                          int process_handle,
+                                          char *function_name,
+                                          void *function_param,
+                                          unsigned int param_size,
+                                          unsigned int opt,
+                                          unsigned long long *api_handle);
+
+/**
+ * @name    bmcpu_query_exec_function_result
+ * @brief   Query result from function called by bmcpu_exec_function_async(_ext)
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  api_handle      Api handle returned by bmcpu_exec_function_async(_ext)
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  0   success.
+ *          >0  code fails from bmlib
+ *          <0  code fails from function
+ */
+DECL_EXPORT int bmcpu_query_exec_function_result(bm_handle_t handle, unsigned long long api_handle, int timeout);
+
+/**
+ * @name    bmcpu_map_phys_addr
+ * @brief   Map physical address in specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  phys_addr       Physical address
+ * @param [in]  size            Map size in bytes
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  >0  virtual address
+ *          0   fails
+ */
+DECL_EXPORT void *bmcpu_map_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, unsigned int size, int timeout);
+
+/**
+ * @name    bmcpu_unmap_phys_addr
+ * @brief   Unmap physical address in specific process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  phys_addr       Physical address
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  <0  fail
+ *          0   success
+ */
+DECL_EXPORT bm_status_t bmcpu_unmap_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, int timeout);
+
+/**
+ * @name    bmcpu_close_process
+ * @brief   Close process
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_close_process(bm_handle_t handle, int process_handle, int timeout);
+
+/**
+ * @name    bmcpu_reset_cpu
+ * @brief   Reset cpu in pcie mode
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_reset_cpu(bm_handle_t handle);
+
+/**
+ * @name    bm_enable_perf_monitor
+ * @brief   enable perf monitor to get gdma and tpu performance data
+ * @ingroup bmlib_perf
+ *
+ * @param [in]  handle         The device handle
+ * @param [in]  perf_monitor   The monitor to perf
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_enable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);
+
+/**
+ * @name    bm_disable_perf_monitor
+ * @brief   disable perf monitor to get gdma and tpu performance data
+ * @ingroup bmlib_perf
+ *
+ * @param [in]  handle         The device handle
+ * @param [in]  perf_monitor   The monitor to perf
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_disable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);
+
+/**
+ * @name    bmcpu_set_log
+ * @brief   Set cpu log options
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  log_level       0: DEBUG  1:INFO 2:WARN 3:ERROR 4:FATAL
+ * @param [in]  log_to_console  1: YES  0: No
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_set_log(bm_handle_t handle, unsigned int log_level,  unsigned int log_to_console, int timeout);
+
+/**
+ * @name    bmcpu_get_log
+ * @brief   Get cpu log file
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @param [in]  process_handle  Process handle
+ * @param [in]  log_file        save log as file
+ * @param [in]  timeout         Timeout value in millisecond, -1 means default value of this device
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_get_log(bm_handle_t handle, int process_handle, char *log_file, int timeout);
+
+/**
+ * @name    bmcpu_sync_time
+ * @brief   Sync device cpu time with host
+ * @ingroup bmlib_log
+ *
+ * @param [in]  handle          The device handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmcpu_sync_time(bm_handle_t handle);
+
+/*******************trace and profile related functions **********************/
+struct bm_heap_stat {
+  unsigned int mem_total;
+  unsigned int mem_avail;
+  unsigned int mem_used;
+};
+
+typedef struct bm_heap_stat_byte {
+  unsigned int  heap_id;
+  unsigned long long mem_total;
+  unsigned long long mem_avail;
+  unsigned long long mem_used;
+  unsigned long long mem_start_addr;
+} bm_heap_stat_byte_t;
+
+typedef struct bm_dev_stat {
+  int mem_total;
+  int mem_used;
+  int tpu_util;
+  int heap_num;
+  struct bm_heap_stat heap_stat[4];
+} bm_dev_stat_t;
+
+/**
+ * @name    bm_get_stat
+ * @brief   To get the stat data at the moment
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [out] stat    The result stat data
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_stat(bm_handle_t handle, bm_dev_stat_t *stat);
+
+/**
+ * @name    bm_get_gmem_heap_id
+ * @brief   To get the heap id of allocated global memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  pmem    The allocated global memory
+ * @param [out] heapid  The result of get heap id
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+
+DECL_EXPORT bm_status_t bm_get_gmem_heap_id(bm_handle_t handle, bm_device_mem_t *pmem, unsigned int *heapid);
+
+/**
+ * @name    sg_get_gmem_heap_id
+ * @brief   To get the heap id of allocated global memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  pmem    The allocated global memory
+ * @param [out] heapid  The result of get heap id
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+
+DECL_EXPORT bm_status_t sg_get_gmem_heap_id(bm_handle_t handle, sg_device_mem_t *pmem, unsigned int *heapid);
+
+/**
+ * @name    bm_get_gmem_total_heap_num
+ * @brief   To get the total heap num of global memory
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle   The device handle
+ * @param [out] heap_num The result of get total num
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_gmem_total_heap_num(bm_handle_t handle, unsigned int *heap_num);
+
+/**
+ * @name    bm_get_gmem_heap_stat_byte_by_id
+ * @brief   To get the heap stat by heap id
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  heap_id The heap index to get heap status
+ * @param [out] pheap_byte The result of get heap status
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_gmem_heap_stat_byte_by_id(bm_handle_t handle, bm_heap_stat_byte_t *pheap_byte, unsigned int heap_id);
+
+DECL_EXPORT bm_status_t bm_load_firmware(
+        bm_handle_t  handle,
+        const char  *firmware_tcm,
+        const char  *firmware_ddr);
+
+#define bmkernel_load_firmware okkernel_load_firmware
+DECL_EXPORT bm_status_t okkernel_load_firmware(
+        bm_handle_t  handle,
+        const char  *firmware_tcm,
+        const char  *firmware_ddr);
+
+DECL_EXPORT bm_status_t okkernel_launch_async(
+        bm_handle_t   handle,
+        const char   *func_name,
+        const void   *args,
+        unsigned int  size);
+
+DECL_EXPORT bm_status_t okkernel_launch_sync(    /* NOTE(review): presumably blocks until the function finishes - confirm */
+        bm_handle_t   handle,
+        const char   *func_name,                  /* presumably the name of the device function to launch - TODO confirm */
+        const void   *args,                       /* argument data handed to the launched function */
+        unsigned int  size);                      /* presumably the byte size of args - TODO confirm */
+
+DECL_EXPORT bm_status_t tpu_kernel_launch_sync(  /* same parameter list as okkernel_launch_sync; difference undocumented */
+        bm_handle_t   handle,
+        const char   *func_name,                  /* presumably the name of the device function to launch - TODO confirm */
+        const void   *args,                       /* argument data handed to the launched function */
+        unsigned int  size);                      /* presumably the byte size of args - TODO confirm */
+
+DECL_EXPORT bm_status_t okkernel_sync(bm_handle_t handle); /* NOTE(review): presumably waits for okkernel_launch_async work - confirm */
+
+/**
+ * @name    bmkernel_launch
+ * @brief   send api to device and launch function
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  args    api cmd struct pointer
+ * @param [in]  size    api cmd length
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmkernel_launch(bm_handle_t handle, const void *args,
+                            unsigned int size);
+
+/**
+ * @name    bmkernel_load_lookup_table
+ * @brief   load lookup table to l2-sram
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle  The device handle
+ * @param [in]  table   The table to be loaded to l2-sram
+ * @param [in]  size    The table size
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bmkernel_load_lookup_table(bm_handle_t handle, const void* table, unsigned int size);
+
+/*******************device management api functions ********************************************/
+/**
+ * @name    bm_get_tpu_current
+ * @brief   get tpu current
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]   handle  The device handle
+ * @param [out]  tpuc    Receives the tpu current, in mA
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_current(bm_handle_t handle, unsigned int *tpuc);
+
+/**
+ * @name    bm_get_board_max_power
+ * @brief   get the max power supported by the board
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]   handle  The device handle
+ * @param [out]  maxp    Receives the board max power (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_board_max_power(bm_handle_t handle, unsigned int *maxp);
+
+/**
+ * @name    bm_get_board_power
+ * @brief   get board power
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]   handle    The device handle
+ * @param [out]  boardp    Receives the board power (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_board_power(bm_handle_t handle, unsigned int *boardp);
+
+/**
+ * @name    bm_get_fan_speed
+ * @brief   get board fan speed
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]   handle The device handle
+ * @param [out]  fan    Receives the fan speed (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_fan_speed(bm_handle_t handle, unsigned int *fan);
+
+/**
+ * @name    bm_get_ecc_correct_num
+ * @brief   get ecc_correct_num
+ * @ingroup device management api
+ *
+ * @param [in]   handle           The device handle
+ * @param [out]  ecc_correct_num  Receives the value; unsigned long* on Linux, unsigned long long* elsewhere
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+#ifdef __linux__
+DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long *ecc_correct_num);
+#else
+DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long long *ecc_correct_num);
+#endif
+/**
+ * @name    bm_get_12v_atx
+ * @brief   get atx_12v
+ * @ingroup device management api
+ *
+ * @param [in]   handle   The device handle
+ * @param [out]  atx_12v  Receives the atx_12v value (meaning/unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_12v_atx(bm_handle_t handle, int *atx_12v);
+
+/**
+ * @name    bm_get_product_sn
+ * @brief   get SE5 sn (takes no device handle)
+ * @ingroup device management api
+ *
+ * @param [out]  product_sn  Caller-provided buffer for the sn; required size is not documented here
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_product_sn(char *product_sn);
+
+/**
+ * @name    bm_get_sn
+ * @brief   get sn
+ * @ingroup device management api
+ *
+ * @param [in]   handle  The device handle
+ * @param [out]  sn      Caller-provided buffer for the sn; required size is not documented here
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_sn(bm_handle_t handle, char *sn);
+
+/**
+ * @name    bm_get_status
+ * @brief   get chip status
+ * @ingroup device management api
+ *
+ * @param [in]   handle  The device handle
+ * @param [out]  status  The board error status, each bit represents an error state
+ *  status == 0x0, board is normal; status > 0, board is abnormal;
+ *  bit0 == 1, tpu is hang
+ *  bit1 == 1, pcie link abnormal
+ *  bit2 == 1, board temperature is too high
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_status(bm_handle_t handle, int *status);
+
+/**
+ * @name    bm_get_tpu_maxclk
+ * @brief   get tpu_maxclk
+ * @ingroup device management api
+ *
+ * @param [in]   handle      The device handle
+ * @param [out]  tpu_maxclk  Receives the tpu max clock (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_maxclk(bm_handle_t handle, unsigned int *tpu_maxclk);
+
+/**
+ * @name    bm_get_tpu_minclk
+ * @brief   get tpu_minclk
+ * @ingroup device management api
+ *
+ * @param [in]   handle      The device handle
+ * @param [out]  tpu_minclk  Receives the tpu min clock (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_minclk(bm_handle_t handle, unsigned int *tpu_minclk);
+
+/**
+ * @name    bm_get_driver_version
+ * @brief   get driver version
+ * @ingroup device management api
+ *
+ * @param [in]   handle          The device handle
+ * @param [out]  driver_version  Receives the driver version as an int (encoding not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_driver_version(bm_handle_t handle, int *driver_version);
+
+/**
+ * @name    bm_get_board_name
+ * @brief   get device board name
+ * @ingroup device management api
+ *
+ * @param [in]   handle The device handle
+ * @param [out]  name   Caller-provided buffer for the board name; required size is not documented here
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_board_name(bm_handle_t handle, char *name);
+
+/**
+ * @name    bm_get_board_temp
+ * @brief   get board temperature
+ * @ingroup device management api
+ *
+ * @param [in]   handle      The device handle
+ * @param [out]  board_temp  Receives the board temperature (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_board_temp(bm_handle_t handle, unsigned int *board_temp);
+
+/**
+ * @name    bm_get_chip_temp
+ * @brief   get chip temperature
+ * @ingroup device management api
+ *
+ * @param [in]   handle     The device handle
+ * @param [out]  chip_temp  Receives the chip temperature (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_chip_temp(bm_handle_t handle, unsigned int *chip_temp);
+
+/**
+ * @name    bm_get_tpu_power
+ * @brief   get TPU power
+ * @ingroup device management api
+ *
+ * @param [in]   handle     The device handle
+ * @param [out]  tpu_power  Receives the TPU power as a float (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_power(bm_handle_t handle, float *tpu_power);
+
+/**
+ * @name    bm_get_tpu_volt
+ * @brief   get TPU voltage
+ * @ingroup device management api
+ *
+ * @param [in]   handle    The device handle
+ * @param [out]  tpu_volt  Receives the TPU voltage (unit not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_volt(bm_handle_t handle, unsigned int *tpu_volt);
+
+/**
+ * @name    bm_get_card_id
+ * @brief   get card id
+ * @ingroup device management api
+ *
+ * @param [in]   handle   The device handle
+ * @param [out]  card_id  Receives the id of the card for this handle
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_card_id(bm_handle_t handle, unsigned int *card_id);
+
+/**
+ * @name    bm_get_card_num
+ * @brief   get card number
+ * @ingroup device management api
+ *
+ * @param [out]  card_num  Receives the card number
+ *                         (this API takes no device handle)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_card_num(unsigned int *card_num);
+
+/**
+ * @name    bm_get_chip_num_from_card
+ * @brief   get chip number and start chip id from card
+ * @ingroup device management api
+ *
+ * @param [in]   card_id          The id of the card to query (no device handle is taken)
+ * @param [out]  chip_num         Receives the chip number of the card
+ * @param [out]  dev_start_index  Receives the start chip id of the card
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_chip_num_from_card(unsigned int card_id, unsigned int *chip_num, unsigned int *dev_start_index);
+
+/**
+ * @name    bm_get_dynfreq_status
+ * @brief   get chip dynamic freq status
+ * @ingroup device management api
+ *
+ * @param [in]   handle          The device handle
+ * @param [out]  dynfreq_status  Receives the current dynamic freq status (value encoding not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_dynfreq_status(bm_handle_t handle, int *dynfreq_status);
+
+/**
+ * @name    bm_change_dynfreq_status
+ * @brief   change(enable/disable) chip dynamic freq status
+ * @ingroup device management api
+ *
+ * @param [in]   handle      The device handle
+ * @param [in]   new_status  The status to set (which value enables/disables is not documented here)
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_change_dynfreq_status(bm_handle_t handle, int new_status);
+
+/**
+ * @name    bm_get_tpu_scalar_num
+ * @brief   To get the core number of TPU scalar
+ * @ingroup bmlib_runtime
+ *
+ * @param [in]  handle    The device handle
+ * @param [out] core_num  Receives the core number of TPU scalar
+ * @retval  BM_SUCCESS  Succeeds.
+ *          Other code  Fails.
+ */
+DECL_EXPORT bm_status_t bm_get_tpu_scalar_num(bm_handle_t handle, unsigned int *core_num);
+
+#define  bm_get_tpu_core_num bm_get_tpu_scalar_num /* bm_get_tpu_core_num is an alias of bm_get_tpu_scalar_num */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* BM_RUNTIME_H_ */
diff --git a/models/Baichuan2/src/include/bmruntime_interface.h b/models/Baichuan2/src/include/bmruntime_interface.h
new file mode 100644
index 0000000..cbf6964
--- /dev/null
+++ b/models/Baichuan2/src/include/bmruntime_interface.h
@@ -0,0 +1,404 @@
+/*****************************************************************************
+ *
+ *    Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
+ *
+ *    The material in this file is confidential and contains trade secrets
+ *    of Sophgo Technologies Inc. This is proprietary information owned by
+ *    Sophgo Technologies Inc. No part of this work may be disclosed,
+ *    reproduced, copied, transmitted, or used in any way for any purpose,
+ *    without the express written permission of Sophgo Technologies Inc.
+ *
+ *****************************************************************************/
+
+/*****************************************************************************
+ * BMRuntime Interface is mainly for inference.
+ * Also we can use it for device computation from BMLang programming.
+ * Note: please use interface from bmlib_runtime.h for device memory operation.
+ ****************************************************************************/
+
+#ifndef BMRUNTIME_INTERFACE_H_
+#define BMRUNTIME_INTERFACE_H_
+
+#include "bmdef.h"
+
+#ifdef _WIN32
+#define DECL_EXPORT _declspec(dllexport)
+#define DECL_IMPORT _declspec(dllimport)
+#else
+#define DECL_EXPORT
+#define DECL_IMPORT
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* --------------------------------------------------------------------------*/
+/* interface for basic data type */
+
+/* get the byte size of the given data type */
+DECL_EXPORT size_t bmrt_data_type_size(bm_data_type_t dtype);
+
+/*
+convert a dims array to bm_shape_t;
+shape and dims should not be NULL, num_dims should not be larger than BM_MAX_DIMS_NUM */
+DECL_EXPORT void bmrt_shape(bm_shape_t* shape, const int* dims, int num_dims);
+
+/*
+number of shape elements; shape should not be NULL and num_dims should not be larger than
+BM_MAX_DIMS_NUM */
+DECL_EXPORT uint64_t bmrt_shape_count(const bm_shape_t* shape);
+
+/* compare whether two shapes are the same */
+DECL_EXPORT bool bmrt_shape_is_same(const bm_shape_t* left, const bm_shape_t* right);
+
+/*
+fill a tensor with data type and shape, and st_mode = 0 as default.
+tensor and p_bmrt should not be NULL, shape count should not be 0.
+it will alloc device mem to tensor->device_mem, so the user should call bmrt_free_device(p_bmrt,
+tensor->device_mem) to free it.*/
+DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape);
+
+/*
+fill a tensor with data type and shape, and st_mode = 0 as default.
+tensor and p_bmrt should not be NULL, shape count should not be 0.
+it will alloc device mem to tensor->device_mem on the devid-th device.*/
+DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape);
+
+/* fill a tensor with existing device mem; tensor byte size should not be larger than device mem size */
+DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem,
+                             bm_data_type_t dtype, bm_shape_t shape);
+
+/* get tensor byte size; tensor should not be NULL */
+DECL_EXPORT size_t bmrt_tensor_bytesize(const bm_tensor_t* tensor);
+
+/* get the tensor mem size allocated in device mem; tensor should not be NULL */
+DECL_EXPORT size_t bmrt_tensor_device_size(const bm_tensor_t* tensor);
+
+/* print net info for debugging */
+DECL_EXPORT void bmrt_print_network_info(const bm_net_info_t* net_info);
+
+/* --------------------------------------------------------------------------*/
+/**
+ * @name    bmrt_create
+ * @brief   To create the bmruntime with bm_handle.
+ * @ingroup bmruntime
+ *
+ * This API creates the bmruntime. It returns a void* pointer which is the pointer
+ * of bmruntime. The device id is the one set when the bm_handle was obtained.
+ *
+ * @param [in] bm_handle     bm handle. It must be initialized by using bmlib.
+ *
+ * @retval void* the pointer of bmruntime (pass it to bmrt_destroy when done)
+ */
+DECL_EXPORT void* bmrt_create(bm_handle_t bm_handle);
+
+/* --------------------------------------------------------------------------*/
+/**
+ * @name    bmrt_create_ex
+ * @brief   To create the bmruntime with one or more bm_handles.
+ * @ingroup bmruntime
+ *
+ * This API creates the bmruntime. It returns a void* pointer which is the pointer
+ * of bmruntime.
+ *
+ * @param [in] bm_handles   Array of bm handles. They must be initialized by using bmlib.
+ * @param [in] num_handles  Number of entries in bm_handles.
+ *
+ * @retval void* the pointer of bmruntime (pass it to bmrt_destroy when done)
+ */
+DECL_EXPORT void *bmrt_create_ex(bm_handle_t *bm_handles, int num_handles);
+
+/**
+ * @name    bmrt_destroy
+ * @brief   To destroy the bmruntime pointer
+ * @ingroup bmruntime
+ *
+ * This API destroys the bmruntime.
+ *
+ * @param [in]     p_bmrt        Bmruntime that had been created (by bmrt_create or bmrt_create_ex)
+ */
+DECL_EXPORT void bmrt_destroy(void* p_bmrt);
+
+/**
+ * @name    bmrt_get_bm_handle
+ * @brief   To get the BM runtime context.
+ * @ingroup bmruntime
+ *
+ * This API gets the BM runtime context (returned as void*) for using BMDNN, BMCV or BMLIB
+ *
+ * @param [in]     p_bmrt        Bmruntime that had been created
+ */
+DECL_EXPORT void * bmrt_get_bm_handle(void* p_bmrt);
+
+/**
+ * @name    bmrt_load_bmodel
+ * @brief   To load the bmodel which is created by BM compiler
+ * @ingroup bmruntime
+ *
+ * This API is to load bmodel created by BM compiler.
+ * After loading bmodel, we can run the inference of neuron network.
+ *
+ * @param   [in]   p_bmrt        Bmruntime that had been created
+ * @param   [in]   bmodel_path   Path of the bmodel file.
+ *
+ * @retval true    Load context success.
+ * @retval false   Load context failed.
+ */
+DECL_EXPORT bool bmrt_load_bmodel(void* p_bmrt, const char *bmodel_path);
+
+/**
+ * @name    bmrt_load_bmodel_data
+ * @brief   To load the bmodel which is created by BM compiler from buffer
+ * @ingroup bmruntime
+ *
+ * This API is to load bmodel created by BM compiler.
+ * After loading bmodel, we can run the inference of neuron network.
+ * Unlike bmrt_load_bmodel, the bmodel is passed as data in host memory.
+ *
+ * @param   [in]   p_bmrt        Bmruntime that had been created
+ * @param   [in]   bmodel_data   Bmodel data pointer to buffer
+ * @param   [in]   size          Bmodel data size
+ *
+ * @retval true    Load context success.
+ * @retval false   Load context failed.
+ */
+DECL_EXPORT bool bmrt_load_bmodel_data(void* p_bmrt, const void * bmodel_data, size_t size);
+
+/**
+ * @name    bmrt_show_neuron_network
+ * @brief   To print the names of all neuron networks
+ * @ingroup bmruntime
+ *
+ * @param [in]     p_bmrt         Bmruntime that had been created
+ */
+DECL_EXPORT void bmrt_show_neuron_network(void* p_bmrt);
+
+/**
+ * @name    bmrt_get_network_number
+ * @brief   To get the number of neuron networks in the bmruntime
+ * @ingroup bmruntime
+ *
+ * @param [in]     p_bmrt         Bmruntime that had been created
+ *
+ * @retval  int value     The number of neuron networks.
+ */
+DECL_EXPORT int bmrt_get_network_number(void* p_bmrt);
+
+/**
+ * @name    bmrt_get_network_names
+ * @brief   To get the names of all neuron networks in the bmruntime
+ * @ingroup bmruntime
+ *
+ * @param [in]     p_bmrt         Bmruntime that had been created
+ * @param [out]    network_names  The names of all neuron networks. It should be declared as (const char** networks_ = NULL),
+ *                                and passed as the param &networks_. After this API, the user needs to free(networks_)
+ *                                once it is no longer needed.
+ */
+DECL_EXPORT void bmrt_get_network_names(void* p_bmrt, const char*** network_names);
+
+/**
+ * @name    bmrt_get_network_info
+ * @brief   To get network info by net name
+ * @ingroup bmruntime
+ *
+ * @param [in]     p_bmrt         Bmruntime that had been created
+ * @param [in]     net_name       Network name
+ *
+ * @retval  bm_net_info_t*        Pointer to net info, need not be freed by the user; NULL if the net name is not found.
+ */
+DECL_EXPORT const bm_net_info_t* bmrt_get_network_info(void* p_bmrt, const char* net_name);
+
+/**
+ * @name    bmrt_launch_tensor
+ * @brief   To launch the inference of the neuron network with setting input tensors
+ * @ingroup bmruntime
+ *
+ * This API supports the neuron network that is static-compiled or dynamic-compiled.
+ * After calling this API, inference on TPU is launched, and the CPU program will not
+ * be blocked. bm_thread_sync should be called to make sure inference has finished.
+ * This API supports multiple inputs, and multi thread safety
+ *
+ * @param [in]    p_bmrt         Bmruntime that had been created
+ * @param [in]    net_name       The name of the neuron network
+ * @param [in]    input_tensors  Array of input tensor, defined like bm_tensor_t input_tensors[input_num].
+ *                               User should initialize each input tensor.
+ * @param [in]    input_num      Input number
+ * @param [out]   output_tensors Array of output tensor, defined like bm_tensor_t output_tensors[output_num].
+ *                               This interface will alloc device mem to store output data. User should free each
+ *                               device mem by bm_free_device once the result data is no longer used.
+ * @param [in]    output_num     Output number
+ *
+ * @retval true    Launch success.
+ * @retval false   Launch failed.
+ */
+DECL_EXPORT bool bmrt_launch_tensor(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
+                        bm_tensor_t output_tensors[], int output_num);
+
+/**
+ * @name    bmrt_launch_tensor_ex
+ * @brief   To launch the inference of the neuron network with setting input tensors
+ * @ingroup bmruntime
+ *
+ * This API supports the neuron network that is static-compiled or dynamic-compiled.
+ * After calling this API, inference on TPU is launched, and the CPU program will not
+ * be blocked. bm_thread_sync should be called to make sure inference has finished.
+ * This API supports multiple inputs, and multi thread safety
+ *
+ * @param [in]    p_bmrt            Bmruntime that had been created
+ * @param [in]    net_name          The name of the neuron network
+ * @param [in]    input_tensors     Array of input tensor, defined like bm_tensor_t input_tensors[input_num],
+ *                                  User should initialize each input tensor.
+ * @param [in]    input_num         Input number
+ * @param [out]   output_tensors    Array of output tensor, defined like bm_tensor_t output_tensors[output_num].
+ *                                  User can set device_mem or stmode of output tensors. If user_mem is true, this interface
+ *                                  will use device mem of output_tensors to store output data, and not alloc device mem;
+ *                                  otherwise it will alloc device mem to store output. If user_stmode is true, it will use
+ *                                  stmode in each output tensor; otherwise stmode will be BM_STORE_1N as default.
+ * @param [in]    output_num        Output number
+ * @param [in]    user_mem          whether device_mem of output tensors are set
+ * @param [in]    user_stmode       whether stmode of output tensors are set
+ *
+ * @retval true    Launch success.
+ * @retval false   Launch failed.
+ */
+DECL_EXPORT bool bmrt_launch_tensor_ex(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
+                           bm_tensor_t output_tensors[], int output_num, bool user_mem, bool user_stmode);
+
+/**
+ * @name    bmrt_launch_data
+ * @brief   To launch the inference of the neuron network with setting input datas in system memory
+ * @ingroup bmruntime
+ *
+ * This API supports the neuron network that is static-compiled or dynamic-compiled.
+ * After calling this API, inference on TPU is launched, and the CPU
+ * program will be blocked.
+ * This API supports multiple inputs, and multi thread safety
+ *
+ * @param [in]    p_bmrt         Bmruntime that had been created
+ * @param [in]    net_name       The name of the neuron network
+ * @param [in]    input_datas    Array of input data, defined like void * input_datas[input_num]. User should
+ *                               initialize each data pointer as input.
+ * @param [in]    input_shapes   Array of input shape, defined like bm_shape_t input_shapes[input_num].
+ *                               User should set each input shape
+ * @param [in]    input_num      Input number
+ * @param [out]   output_datas   Array of output data, defined like void * output_datas[output_num].
+ *                               If the user doesn't alloc each output data, set user_mem to false and this api will alloc
+ *                               output mem; the user should free each output mem when the output data is no longer used.
+ *                               The user can also alloc system memory for each output data and set user_mem = true.
+ * @param [out]   output_shapes  Array of output shape, defined like bm_shape_t output_shapes[output_num].
+ *                               It will store each output shape.
+ * @param [in]    output_num     Output number
+ * @param [in]    user_mem       whether output_datas[i] have allocated memory
+ *
+ * @retval true    Launch success.
+ * @retval false   Launch failed.
+ */
+DECL_EXPORT bool bmrt_launch_data(void* p_bmrt, const char* net_name, void* const input_datas[],
+                      const bm_shape_t input_shapes[], int input_num, void * output_datas[],
+                      bm_shape_t output_shapes[], int output_num, bool user_mem);
+
+/**
+ * @name    bmrt_trace
+ * @brief   To check runtime environment, and collect info for DEBUG
+ * @ingroup bmruntime
+ *
+ * This API is to collect runtime info for DEBUG. Especially when a launch result is suddenly wrong, calling bmrt_trace
+ * will show whether device mems are broken, and other check info.
+ *
+ * @param [in]    p_bmrt         Bmruntime that had been created
+ */
+DECL_EXPORT void bmrt_trace(void* p_bmrt);
+
+/**
+ * @name    bmrt_launch_tensor_multi_cores
+ * @brief   To launch the inference of the neuron network with setting input tensors, and support multi core inference.
+ * @ingroup bmruntime
+ *
+ * This API supports the neuron network that is static-compiled or dynamic-compiled.
+ * After calling this API, inference on TPU is launched, and the CPU program will not
+ * be blocked. bm_thread_sync_from_core should be called to make sure inference is finished.
+ * This API supports multiple inputs, and multi thread safety
+ *
+ * @param [in]    p_bmrt            Bmruntime that had been created
+ * @param [in]    net_name          The name of the neuron network
+ * @param [in]    input_tensors     Array of input tensor, defined like bm_tensor_t input_tensors[input_num],
+ *                                  User should initialize each input tensor.
+ * @param [in]    input_num         Input number
+ * @param [out]   output_tensors    Array of output tensor, defined like bm_tensor_t output_tensors[output_num].
+ *                                  User can set device_mem or stmode of output tensors. If user_mem is true, this interface
+ *                                  will use device mem of output_tensors to store output data, and not alloc device mem;
+ *                                  otherwise it will alloc device mem to store output. If user_stmode is true, it will use
+ *                                  stmode in each output tensor; otherwise stmode will be BM_STORE_1N as default.
+ * @param [in]    output_num        Output number
+ * @param [in]    user_mem          whether device_mem of output tensors are set
+ * @param [in]    user_stmode       whether stmode of output tensors are set
+ * @param [in]    core_list         list of core ids that will be used for inference
+ * @param [in]    core_num          number of entries in core_list
+ *
+ * @retval true    Launch success.
+ * @retval false   Launch failed.
+ */
+DECL_EXPORT bool bmrt_launch_tensor_multi_cores(
+    void *p_bmrt,
+    const char *net_name,
+    const bm_tensor_t input_tensors[],
+    int input_num,
+    bm_tensor_t output_tensors[],
+    int output_num,
+    bool user_mem,
+    bool user_stmode,
+    const int *core_list,
+    int core_num);
+
+/**
+ *  @name    bmrt_memcpy_s2d_parallel
+ *  @brief   To copy data from system memory to multi-device memory in parallel
+ *  @ingroup bmruntime
+ *
+ *  This API can only be used when p_bmrt is created with bmrt_create_ex on multi devices.
+ *  After calling this API, datas[:tensor_num[0]] will be copied to the first device, and
+ *  datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device and so on.
+ *  Copies to different devices run in parallel; copies to the same device run in sequence.
+ *
+ *  @param [in]     p_bmrt      Bmruntime that had been created with multi bm_handles
+ *  @param [in]     tensors     Array of tensors that will be copied to devices
+ *  @param [in]     datas       Array of datas allocated in system memory
+ *  @param [in]     tensor_num  Array of tensor_num that will be copied to each device
+ *  @param [in]     device_num  Device number
+*/
+DECL_EXPORT bool bmrt_memcpy_s2d_parallel(
+    void *p_bmrt,
+    bm_tensor_t tensors[],
+    void *datas[],
+    int tensor_num[],
+    int device_num);
+
+/**
+ *  @name    bmrt_memcpy_d2s_parallel
+ *  @brief   To copy data from multi-device memory to system memory in parallel
+ *  @ingroup bmruntime
+ *
+ *  This API can only be used when p_bmrt is created with bmrt_create_ex on multi devices.
+ *  After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]], and
+ *  tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] and so on.
+ *  Copies from different devices run in parallel; copies from the same device run in sequence.
+ *
+ *  @param [in]     p_bmrt      Bmruntime that had been created with multi bm_handles
+ *  @param [out]    datas       Array of datas allocated in system memory; receives the copied tensor data
+ *  @param [in]     tensors     Array of tensors that will be copied from devices
+ *  @param [in]     tensor_num  Array of tensor_num that will be copied from each device
+ *  @param [in]     device_num  Device number
+*/
+DECL_EXPORT bool bmrt_memcpy_d2s_parallel(
+    void *p_bmrt,
+    void *datas[],
+    bm_tensor_t tensors[],
+    int tensor_num[],
+    int device_num);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
diff --git a/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h b/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h
new file mode 100644
index 0000000..14b1e8c
--- /dev/null
+++ b/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h
@@ -0,0 +1,727 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SENTENCEPIECE_PROCESSOR_H_
+#define SENTENCEPIECE_PROCESSOR_H_
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#ifndef SWIG
+namespace absl {
+using std::string_view;
+}  // namespace absl
+#endif  // SWIG
+
+namespace sentencepiece {
+namespace util {
+
+enum class StatusCode : int {
+  kOk = 0,
+  kCancelled = 1,
+  kUnknown = 2,
+  kInvalidArgument = 3,
+  kDeadlineExceeded = 4,
+  kNotFound = 5,
+  kAlreadyExists = 6,
+  kPermissionDenied = 7,
+  kResourceExhausted = 8,
+  kFailedPrecondition = 9,
+  kAborted = 10,
+  kOutOfRange = 11,
+  kUnimplemented = 12,
+  kInternal = 13,
+  kUnavailable = 14,
+  kDataLoss = 15,
+  kUnauthenticated = 16,
+};
+
+class Status {
+ public:
+  Status();
+  ~Status();
+  Status(StatusCode code, absl::string_view error_message);
+  Status(const Status &s);
+  void operator=(const Status &s);
+  bool operator==(const Status &s) const;
+  bool operator!=(const Status &s) const;
+  inline bool ok() const { return rep_ == nullptr; }
+
+  void set_error_message(const char *str);
+  const char *error_message() const;
+  const char *message() const { return error_message(); }
+  StatusCode code() const;
+  std::string ToString() const;
+
+  void IgnoreError();
+
+ private:
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+}  // namespace util
+
+// SentencePieceProcessor:
+// Simple and language independent tokenizer and de-tokenizer for
+// Neural Network Machine Translation.
+//
+// SentencePieceProcessor provides Encode() and Decode() methods,
+// which correspond to tokenization and de-tokenization respectively.
+//
+// - Encode:
+//   Given a raw source sentence, encode it into a sequence
+//   of pieces or vocabulary ids.
+//
+// - Decode:
+//   Given a sequence of pieces or vocabulary ids, decode it
+//   into a de-tokenized raw sentence.
+//
+// SentencePieceProcessor provides a lossless data conversion
+// that allows the original raw sentence to be perfectly reconstructed
+// from the encoded data, i.e., Decode(Encode(input)) == input.
+// This characteristic is useful, as we can make the de-tokenization
+// completely language independent.
+//
+// Usage:
+//   SentencePieceProcessor sp;
+//   sp.Load("//path/to/model");
+//
+//   vector<string> sps;
+//   sp.Encode("hello world.", &sps).IgnoreError();
+//
+//   vector<int> ids;
+//   sp.Encode("hello world.", &ids).IgnoreError();
+//
+//   string detok;
+//   sp.Decode(sps, &detok).IgnoreError();
+//   CHECK_EQ("hello world.", detok);
+//
+//   sp.Decode(ids, &detok).IgnoreError();
+//   CHECK_EQ("hello world.", detok);
+//
+//  We can also use SentencePieceText which manages the byte-offsets
+//  between user input (output) and internal sentence pieces.
+//
+//   SentencePieceText spt;
+//   sp.Encode("hello world.", &spt);
+//   // Emits the byte range of each piece.
+//   for (const auto &piece : spt.pieces()) {
+//      LOG(INFO) << piece.begin() << " " << piece.end();
+//   }
+//
+//   sp.Decode({0, 1, 2, 3..}, &spt);
+//   for (const auto &piece : spt.pieces()) {
+//      LOG(INFO) << piece.begin() << " " << piece.end();
+//   }
+//
+
+class NBestSentencePieceText;
+class ModelInterface;
+class SentencePieceText;
+class ModelProto;
+
+namespace normalizer {
+class Normalizer;
+}  // namespace normalizer
+
+#ifndef SWIGGO
+namespace util {
+// Redefine std::string for serialized_proto interface as Python's string is
+// a Unicode string. We can enforce the return value to be raw byte sequence
+// with SWIG's typemap.
+using bytes = std::string;
+}  // namespace util
+#endif  // SWIGGO
+
+class NBestSentencePieceText;
+class ModelInterface;
+class SentencePieceText;
+class SentencePieceText_SentencePiece;
+
+// Wrapper class of SentencePieceText
+// This wrapper only allows an immutable access to the proto and
+// hides the actual implementation of protobuf.
+// See sentencepiece.proto for the details of this class.
+class ImmutableSentencePieceText_ImmutableSentencePiece {
+ public:
+  ImmutableSentencePieceText_ImmutableSentencePiece();
+  ~ImmutableSentencePieceText_ImmutableSentencePiece() = default;
+
+  const std::string &piece() const;
+  const std::string &surface() const;
+  uint32_t id() const;
+  uint32_t begin() const;
+  uint32_t end() const;
+
+  friend class ImmutableSentencePieceText;
+
+ private:
+  explicit ImmutableSentencePieceText_ImmutableSentencePiece(
+      const SentencePieceText_SentencePiece &sp);
+  const SentencePieceText_SentencePiece *sp_ = nullptr;
+};
+
+class ImmutableSentencePieceText {
+ public:
+  ImmutableSentencePieceText();
+  virtual ~ImmutableSentencePieceText();
+
+  std::vector<ImmutableSentencePieceText_ImmutableSentencePiece> pieces() const;
+
+  size_t pieces_size() const;
+  ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const;
+
+  const std::string &text() const;
+  float score() const;
+
+  util::bytes SerializeAsString() const;
+
+  // Returns the actual mutable proto.
+  // Do not use this outside of SentencePieceProcessor, as
+  // it returns the raw pointer managed by the shared_ptr.
+  SentencePieceText *mutable_proto();
+
+  // Converts the utf8 byte spans into Unicode char span.
+  void ConvertToUnicodeSpans();
+
+  friend class ImmutableNBestSentencePieceText;
+
+ private:
+  explicit ImmutableSentencePieceText(const SentencePieceText &spt);
+  const SentencePieceText *spt_ = nullptr;
+  std::shared_ptr<SentencePieceText> rep_;
+};
+
+// Wrapper class of NBestSentencePieceText
+// This wrapper only allows an immutable access to the proto and
+// hides the actual implementation of protobuf.
+// See sentencepiece.proto for the details of this class.
+class ImmutableNBestSentencePieceText {
+ public:
+  ImmutableNBestSentencePieceText();
+  virtual ~ImmutableNBestSentencePieceText();
+
+  std::vector<ImmutableSentencePieceText> nbests() const;
+
+  size_t nbests_size() const;
+  ImmutableSentencePieceText nbests(int index) const;
+
+  util::bytes SerializeAsString() const;
+
+  // Returns the actual mutable proto.
+  // Do not use this outside of SentencePieceProcessor, as
+  // it returns the raw pointer managed by the shared_ptr.
+  NBestSentencePieceText *mutable_proto();
+
+  void ConvertToUnicodeSpans();
+
+ private:
+  std::shared_ptr<NBestSentencePieceText> rep_;
+};
+
+class SentencePieceProcessor {
+ public:
+  SentencePieceProcessor();
+  virtual ~SentencePieceProcessor();
+
+  // Loads model from `filename`.
+  // Returns a non-OK status if `filename` cannot be loaded.
+  virtual util::Status Load(absl::string_view filename);
+
+  // Loads model from `filename`.
+  // Crash if `filename` cannot be loaded.
+  virtual void LoadOrDie(absl::string_view filename);
+
+  // Loads model from `model_proto`.
+  // `model_proto` is copied.
+  virtual util::Status Load(const ModelProto &model_proto);
+
+  // Loads model from `model_proto`.
+  // `model_proto` is moved.
+  virtual util::Status Load(std::unique_ptr<ModelProto> model_proto);
+
+  // Loads model from `serialized`, which is a string-serialized model proto.
+  // Useful to load the model from a platform independent blob object.
+  virtual util::Status LoadFromSerializedProto(absl::string_view serialized);
+
+  // Returns the status. Encode/Decode methods are valid when status is OK.
+  virtual util::Status status() const;
+
+  // Sets encode extra_option sequence.
+  virtual util::Status SetEncodeExtraOptions(absl::string_view extra_option);
+
+  // Sets decode extra_option sequence.
+  virtual util::Status SetDecodeExtraOptions(absl::string_view extra_option);
+
+  //////////////////////////////////////////////////////////////
+  // Vocabulary restriction.
+  // Background:
+  // https://github.com/rsennrich/subword-nmt#best-practice-advice-for-byte-pair-encoding-in-nmt
+
+  // Restricts the vocabulary set.
+  // The input sentences are encoded into the tokens in `valid_vocab`.
+  virtual util::Status SetVocabulary(
+      const std::vector<absl::string_view> &valid_vocab);
+
+  // Reverts the vocabulary restriction.
+  virtual util::Status ResetVocabulary();
+
+  // Loads the valid vocabulary set from `filename` in TSV format.
+  // Format:  <token> <tab> <freq>.
+  // Any token with frequency < threshold will be treated as OOV.
+  virtual util::Status LoadVocabulary(absl::string_view filename,
+                                      int threshold);
+
+  //////////////////////////////////////////////////////////////
+  // Simple Encode and Decode API.
+  //
+  // Given a UTF8 input, encodes it into a sequence of sentence pieces.
+  virtual util::Status Encode(absl::string_view input,
+                              std::vector<std::string> *pieces) const;
+
+  // Given a UTF8 input, encodes it into a sequence of ids.
+  virtual util::Status Encode(absl::string_view input,
+                              std::vector<int> *ids) const;
+
+  // Given a sequence of pieces, decodes it into a detokenized output.
+  virtual util::Status Decode(const std::vector<std::string> &pieces,
+                              std::string *detokenized) const;
+
+  // Given a sequence of pieces, decodes it into a detokenized output.
+  virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
+                              std::string *detokenized) const;
+
+  // Given a sequence of ids, decodes it into a detokenized output.
+  virtual util::Status Decode(const std::vector<int> &ids,
+                              std::string *detokenized) const;
+
+  //////////////////////////////////////////////////////////////
+  // NBest API.
+  //
+  // Same as Encode, but returns nbest results.
+  virtual util::Status NBestEncode(
+      absl::string_view input, int nbest_size,
+      std::vector<std::vector<std::string>> *pieces) const;
+
+  // Same as Encode, but returns nbest results.
+  virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
+                                   std::vector<std::vector<int>> *ids) const;
+
+  //////////////////////////////////////////////////////////////
+  // Sampling API.
+  //
+  // Unigram and BPE support sampling mode.
+  // - Unigram (--model_type=unigram):
+  // `nbest_size`: When `nbest_size` is positive value, approximately samples
+  // one segmentation from nbest candidates. When `nbest_size` is negative
+  // value, samples one segmentation from the hypotheses (Lattice) according to
+  // the generation probabilities using forward-filtering and backward-sampling
+  // algorithm.
+  // `alpha`: Smoothing parameter (inverse temperature). The best segmentation
+  // (Viterbi segmentation) is more likely sampled when setting larger alpha.
+  // When alpha is 0.0, one segmentation is uniformly sampled from the nbest or
+  // lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha`
+  // in https://arxiv.org/abs/1804.10959  (nbest_size < 0 means l = infinity)
+  //
+  // - BPE (--model_type=bpe):
+  // `alpha`: The dropout probability `p` of bpe merge operations in
+  // https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so
+  // nbest_size parameter is ignored in BPE.
+  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
+                                    float alpha,
+                                    std::vector<std::string> *pieces) const;
+
+  // Same as above, but returns a sequence of ids.
+  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
+                                    float alpha, std::vector<int> *ids) const;
+
+  //////////////////////////////////////////////////////////////
+  // SampleEncodeAndScore API.
+  //
+  // Sample `samples` many tokenisations from the segmentation lattice.
+  // These methods are only available in model_type=unigram.
+  //
+  // `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in
+  // `Sample` method.
+  // `wor`: If `wor` is true, the samples are taken without replacement, and the
+  // scores are the inclusion probabilities of the elements in the sample;
+  // otherwise the samples are taken with replacement and the scores are the
+  // log-probs of sample elements
+  // `include_best`: If `include_best` is true, the best tokenisation is always
+  // included in the sample, and the remaining elements are sampled excluding
+  // the best.
+  virtual util::Status SampleEncodeAndScore(
+      absl::string_view input, int num_samples, float alpha, bool wor,
+      bool include_best,
+      std::vector<std::pair<std::vector<std::string>, float>> *pieces) const;
+
+  // Same as above, but returns a sequence of ids.
+  virtual util::Status SampleEncodeAndScore(
+      absl::string_view input, int num_samples, float alpha, bool wor,
+      bool include_best,
+      std::vector<std::pair<std::vector<int>, float>> *ids) const;
+
+  //////////////////////////////////////////////////////////////
+  // Entropy API.
+  //
+  // This is only available in model_type=unigram.
+  // Calculate entropy of possible tokenisations
+  virtual util::Status CalculateEntropy(absl::string_view input, float alpha,
+                                        float *entropy) const;
+
+  //////////////////////////////////////////////////////////////
+  // Advanced API returning SentencePieceText, which manages
+  // utf8-byte alignments between user-input/detokenized text
+  // and internal sentencepiece sequence.
+  //
+  // Given a UTF8 input, encodes it into SentencePieceText.
+  //
+  // When using these APIs, sentencepiece.pb.h header files must be included.
+  // We can also use ImmutableSentencePieceText as follows.
+  //
+  // ImmutableSentencePieceText spt;
+  // Encode("hello", spt.mutable_proto()).IgnoreError();
+  // std::cout << spt.pieces_size() << std::endl;
+  virtual util::Status Encode(absl::string_view input,
+                              SentencePieceText *spt) const;
+
+  virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
+                                   NBestSentencePieceText *nbest_spt) const;
+
+  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
+                                    float alpha, SentencePieceText *spt) const;
+
+  virtual util::Status SampleEncodeAndScore(
+      absl::string_view input, int num_samples, float alpha, bool wor,
+      bool include_best, NBestSentencePieceText *samples_spt) const;
+
+  // DEPRECATED: Remove this API and use std::vector<std::string_view>
+  virtual util::Status Decode(const std::vector<std::string> &pieces,
+                              SentencePieceText *spt) const;
+
+  virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
+                              SentencePieceText *spt) const;
+
+  virtual util::Status Decode(const std::vector<int> &ids,
+                              SentencePieceText *spt) const;
+#ifdef SWIG
+#define SPP_SWIG_CHECK_AND_THROW \
+  if (!status.ok()) throw status;
+#else
+#define SPP_SWIG_CHECK_AND_THROW \
+  if (!status.ok()) {            \
+  }
+#endif  // SWIG
+
+#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \
+  OutType output;                                           \
+  const auto status = FuncName(__VA_ARGS__, &output);       \
+  SPP_SWIG_CHECK_AND_THROW;				    \
+  return output;
+
+#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...)     \
+  OutType output;                                                    \
+  const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
+  SPP_SWIG_CHECK_AND_THROW;					     \
+  return output.SerializeAsString();
+
+#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...)      \
+  OutType output;                                                    \
+  const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
+  SPP_SWIG_CHECK_AND_THROW;					     \
+  return output;
+
+  //////////////////////////////////////////////////////////////
+  // Handy methods that return the result directly.
+  // These functions ignore internal errors.
+  virtual std::vector<std::string> EncodeAsPieces(
+      absl::string_view input) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<std::string>, input);
+  }
+
+  virtual std::vector<int> EncodeAsIds(absl::string_view input) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<int>, input);
+  }
+
+  virtual std::vector<std::vector<std::string>> NBestEncodeAsPieces(
+      absl::string_view input, int nbest_size) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(
+        NBestEncode, std::vector<std::vector<std::string>>, input, nbest_size);
+  }
+
+  virtual std::vector<std::vector<int>> NBestEncodeAsIds(
+      absl::string_view input, int nbest_size) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(NBestEncode, std::vector<std::vector<int>>,
+                                input, nbest_size);
+  }
+
+  virtual std::vector<std::string> SampleEncodeAsPieces(absl::string_view input,
+                                                        int nbest_size,
+                                                        float alpha) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<std::string>, input,
+                                nbest_size, alpha);
+  }
+
+  virtual std::vector<int> SampleEncodeAsIds(absl::string_view input,
+                                             int nbest_size,
+                                             float alpha) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<int>, input,
+                                nbest_size, alpha);
+  }
+
+  virtual std::vector<std::pair<std::vector<std::string>, float>>
+  SampleEncodeAndScoreAsPieces(absl::string_view input, int num_samples,
+                               float alpha, bool wor, bool include_best) const {
+    using _T = std::vector<std::pair<std::vector<std::string>, float>>;
+    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
+                                alpha, wor, include_best);
+  }
+
+  virtual std::vector<std::pair<std::vector<int>, float>>
+  SampleEncodeAndScoreAsIds(absl::string_view input, int num_samples,
+                            float alpha, bool wor, bool include_best) const {
+    using _T = std::vector<std::pair<std::vector<int>, float>>;
+    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
+                                alpha, wor, include_best);
+  }
+
+  // DEPRECATED: Remove this API and use std::vector<std::string_view>
+  virtual std::string DecodePieces(
+      const std::vector<std::string> &pieces) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
+  }
+
+  virtual std::string DecodePieces(
+      const std::vector<absl::string_view> &pieces) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
+  }
+
+  virtual std::string DecodeIds(const std::vector<int> &ids) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids);
+  }
+
+  virtual float CalculateEntropy(absl::string_view text, float alpha) const {
+    DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, alpha);
+  }
+
+  //////////////////////////////////////////////////////////////
+  // SerializedProto API. (DEPRECATED). Use ImmutableProto API.
+  // They are used in Python interface. Returns serialized proto.
+  // In python module, we can get access to the full Proto after
+  // deserializing the returned byte sequence.
+  virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
+  }
+
+  virtual util::bytes SampleEncodeAsSerializedProto(absl::string_view input,
+                                                    int nbest_size,
+                                                    float alpha) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
+                                     input, nbest_size, alpha);
+  }
+
+  virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input,
+                                                   int nbest_size) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(
+        NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
+  }
+
+  virtual util::bytes SampleEncodeAndScoreAsSerializedProto(
+      absl::string_view input, int num_samples, float alpha, bool wor,
+      bool include_best) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore,
+                                     ImmutableNBestSentencePieceText, input,
+                                     num_samples, alpha, wor, include_best);
+  }
+
+  // TODO(taku): Remove this API and use std::vector<std::string_view>
+  virtual util::bytes DecodePiecesAsSerializedProto(
+      const std::vector<std::string> &pieces) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
+                                     pieces);
+  }
+
+  virtual util::bytes DecodePiecesAsSerializedProto(
+      const std::vector<absl::string_view> &pieces) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
+                                     pieces);
+  }
+
+  virtual util::bytes DecodeIdsAsSerializedProto(
+      const std::vector<int> &ids) const {
+    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
+  }
+
+  //////////////////////////////////////////////////////////////
+  // ImmutableProto API.
+  virtual ImmutableSentencePieceText EncodeAsImmutableProto(
+      absl::string_view input) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
+  }
+
+  virtual ImmutableSentencePieceText SampleEncodeAsImmutableProto(
+      absl::string_view input, int nbest_size, float alpha) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
+                                    input, nbest_size, alpha);
+  }
+
+  virtual ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto(
+      absl::string_view input, int nbest_size) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(
+        NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
+  }
+
+  virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto(
+      absl::string_view input, int num_samples, float alpha, bool wor,
+      bool include_best) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore,
+                                    ImmutableNBestSentencePieceText, input,
+                                    num_samples, alpha, wor, include_best);
+  }
+
+  // TODO(taku): Remove this API and use std::vector<std::string_view>
+  virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
+      const std::vector<std::string> &pieces) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
+  }
+
+  virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
+      const std::vector<absl::string_view> &pieces) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
+  }
+
+  virtual ImmutableSentencePieceText DecodeIdsAsImmutableProto(
+      const std::vector<int> &ids) const {
+    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
+  }
+
+#undef DEFINE_SPP_DIRECT_FUNC_IMPL
+#undef DEFINE_SPP_SERIALIZED_PROTO_IMPL
+#undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL
+
+  //////////////////////////////////////////////////////////////
+  // Vocabulary management methods.
+  //
+  // Returns the size of sentence pieces, which is the same as
+  // the size of vocabulary for NMT.
+  virtual int GetPieceSize() const;
+
+  // Returns the vocab id of `piece`.
+  // Returns UNK(0) if `piece` is unknown.
+  virtual int PieceToId(absl::string_view piece) const;
+
+  // Returns the string representation of vocab with `id`.
+  virtual const std::string &IdToPiece(int id) const;
+
+  // Returns the score of `id`.
+  // Usually score is an emission log probability of unigram language
+  // model.
+  virtual float GetScore(int id) const;
+
+  // Returns true if `id` is unknown symbol.
+  virtual bool IsUnknown(int id) const;
+
+  // Returns true if `id` is control symbol.
+  virtual bool IsControl(int id) const;
+
+  // Returns true if `id` is unused symbol.
+  virtual bool IsUnused(int id) const;
+
+  // Returns true if `id` is byte symbol.
+  virtual bool IsByte(int id) const;
+
+  // Returns the reserved id.
+  // Returns -1 if not defined.
+
+  // Returns unknown (<unk>) id.
+  virtual int unk_id() const;
+
+  // Returns BOS (<s>) id.
+  virtual int bos_id() const;
+
+  // Returns EOS (</s>) id.
+  virtual int eos_id() const;
+
+  // Returns PAD (<pad>) id.
+  virtual int pad_id() const;
+
+  //////////////////////////////////////////////////////////////
+  // Model management.
+  //
+  // Allows injection of a mock model instance. `model` is moved.
+  void SetModel(std::unique_ptr<ModelInterface> &&model);
+
+  // Allows injection of a normalizer instance. `normalizer` is moved.
+  void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
+
+  // Returns immutable model proto. Useful to obtain extended
+  // or experimental parameters encoded in model_proto.
+  const ModelProto &model_proto() const;
+
+  // returns immutable model proto as std::string.
+  // Useful to save the state of this instance via Python's pickle object.
+  util::bytes serialized_model_proto() const;
+
+ private:
+  enum ExtraOption { REVERSE, BOS, EOS, UNK_PIECE };
+
+  util::Status ParseExtraOptions(absl::string_view extra_option,
+                                 std::vector<ExtraOption> *extra_options) const;
+
+  util::Status ApplyExtraOptions(const std::vector<ExtraOption> &extra_options,
+                                 SentencePieceText *spt) const;
+
+  util::Status PopulateSentencePieceText(
+      absl::string_view input, absl::string_view normalized,
+      const std::vector<size_t> &norm_to_orig,
+      const std::vector<std::pair<absl::string_view, int>> &result,
+      SentencePieceText *spt) const;
+
+  std::unique_ptr<ModelInterface> model_;
+  std::unique_ptr<normalizer::Normalizer> normalizer_;
+  std::unique_ptr<normalizer::Normalizer> denormalizer_;
+
+  // Underlying model protocol buffer. The same lifetime as model_.
+  std::unique_ptr<ModelProto> model_proto_;
+
+  std::vector<ExtraOption> encode_extra_options_;
+  std::vector<ExtraOption> decode_extra_options_;
+};
+
+// Set seed value of random generator.
+// Do not set static_cast<unsigned int>(-1),
+// as this seed is reserved for initializing from
+// std::random_device.
+void SetRandomGeneratorSeed(unsigned int seed);
+
+// IO related functions to absorb model formats.
+namespace io {
+// Loads `model_proto` from `filename`.
+// We can instantiate SentencePieceProcessor as follows:
+//
+//  auto model_proto = absl::make_unique<ModelProto>();
+//  io::LoadModelProto("//path/spm.model", model_proto.get());
+//  SentencePieceProcessor sp;
+//  CHECK_OK(sp.Load(std::move(model_proto)));
+util::Status LoadModelProto(absl::string_view, ModelProto *model_proto);
+
+// Saves `model_proto` as `filename`.
+util::Status SaveModelProto(absl::string_view, const ModelProto &model_proto);
+}  // namespace io
+}  // namespace sentencepiece
+#endif  // SENTENCEPIECE_PROCESSOR_H_
diff --git a/models/Baichuan2/src/lib_pcie/libbmlib.so b/models/Baichuan2/src/lib_pcie/libbmlib.so
new file mode 100644
index 0000000..7f9a95f
Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmlib.so differ
diff --git a/models/Baichuan2/src/lib_pcie/libbmrt.so b/models/Baichuan2/src/lib_pcie/libbmrt.so
new file mode 100644
index 0000000..137929f
Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmrt.so differ
diff --git a/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0 b/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0
new file mode 100644
index 0000000..137929f
Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0 differ
diff --git a/models/Baichuan2/src/lib_pcie/libsentencepiece.a b/models/Baichuan2/src/lib_pcie/libsentencepiece.a
new file mode 100644
index 0000000..7c17fa2
Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libsentencepiece.a differ
diff --git a/models/Baichuan2/src/lib_soc/libbmlib.so b/models/Baichuan2/src/lib_soc/libbmlib.so
new file mode 100644
index 0000000..81c75c1
Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmlib.so differ
diff --git a/models/Baichuan2/src/lib_soc/libbmrt.so b/models/Baichuan2/src/lib_soc/libbmrt.so
new file mode 100644
index 0000000..d182777
Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmrt.so differ
diff --git a/models/Baichuan2/src/lib_soc/libbmrt.so.1.0 b/models/Baichuan2/src/lib_soc/libbmrt.so.1.0
new file mode 100644
index 0000000..d182777
Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmrt.so.1.0 differ
diff --git a/models/Baichuan2/src/lib_soc/libsentencepiece.a b/models/Baichuan2/src/lib_soc/libsentencepiece.a
new file mode 100644
index 0000000..39debcd
Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libsentencepiece.a differ
diff --git a/models/Baichuan2/src/tokenizer.model b/models/Baichuan2/src/tokenizer.model
new file mode 100644
index 0000000..4348df0
Binary files /dev/null and b/models/Baichuan2/src/tokenizer.model differ
diff --git a/models/Baichuan2/web_demo/CMakeLists.txt b/models/Baichuan2/web_demo/CMakeLists.txt
new file mode 100755
index 0000000..119534c
--- /dev/null
+++ b/models/Baichuan2/web_demo/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 2.8)
+project(baichuan2)
+
+if (NOT DEFINED TARGET_ARCH)
+    set(TARGET_ARCH pcie)
+endif()
+
+set(CMAKE_INSTALL_PREFIX install)
+
+if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")  # host is aarch64: SoC mode, no cross toolchain is set
+    add_definitions(-DSOC_TARGET)
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
+    message("SoC mode, starting......")
+elseif (${TARGET_ARCH} STREQUAL "pcie")  # TARGET_ARCH falls back to pcie when the caller does not define it
+    add_definitions(-DPCIE_TARGET)
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie)
+    message("Pcie mode, starting......")
+elseif (${TARGET_ARCH} STREQUAL "soc")  # SoC target built with the aarch64-linux-gnu toolchain
+    add_definitions(-DSOC_TARGET)
+    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+    set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc)
+    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)  # prebuilt SoC libs live under ../src, same as the aarch64-host branch
+    message("SoC mode, starting......")
+endif()
+
+
+
+
+include_directories(${PROJECT_SOURCE_DIR}/../src/include)
+
+add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
+set(CMAKE_BUILD_TYPE "Debug")
+
+add_library(tpuchat SHARED chat.cpp)
+target_link_libraries(tpuchat bmrt bmlib sentencepiece)
diff --git a/models/Baichuan2/web_demo/chat.cpp b/models/Baichuan2/web_demo/chat.cpp
new file mode 100755
index 0000000..84724c6
--- /dev/null
+++ b/models/Baichuan2/web_demo/chat.cpp
@@ -0,0 +1,419 @@
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Sophgo Technologies Inc.  All rights reserved.
+//
+// TPU-MLIR is licensed under the 2-Clause BSD License except for the
+// third-party components.
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <assert.h>
+#include <chrono>
+#include <algorithm>
+#include "memory.h"
+#include "sentencepiece/sentencepiece_processor.h"
+#include "bmruntime_interface.h"
+#include <getopt.h>
+
+// Number of block_<i> / block_cache_<i> networks looked up in the bmodel.
+static constexpr int NUM_LAYERS = 32;
+// Sequence capacity used to size the host-side id, position and mask buffers.
+static constexpr int MAX_LEN = 512;
+// Value written into the masked-out entries of the attention mask.
+static constexpr float ATTENTION_MASK = -1000.;
+
+// NOTE(review): not referenced by the code visible in this file.
+static const std::string TOKENIZER_MODEL = "tokenizer.model";
+
+// #define EXPORT_RESULTS
+#ifdef EXPORT_RESULTS
+#include "cnpy.h"
+static cnpy::npz_t map;
+
+// Copies the device buffer `dst` back to the host as an array of T and adds it
+// to the file-level npz map under `name`.
+template <typename T>
+static void add_array(std::string name, bm_handle_t bm_handle,
+                      const bm_device_mem_t &dst) {
+  // Element count is derived from the byte size of the device buffer.
+  const size_t count = dst.size / sizeof(T);
+  std::vector<T> host(count);
+  bm_memcpy_d2s(bm_handle, host.data(), dst);
+  cnpy::npz_add_array(map, name, host);
+}
+
+// Hands the file-level npz map to cnpy::npz_save_all together with `filename`.
+static void save_array(std::string filename) {
+  cnpy::npz_save_all(filename, map);
+}
+#endif
+
+// Driver for a Baichuan2 bmodel: owns the tokenizer, the runtime handle and
+// the device tensors, and exposes token-by-token prediction.
+//
+// Scalar and pointer members carry default initializers so that an instance
+// has a defined state before init() is called, however it is constructed.
+class Baichuan2 {
+public:
+  // Loads the tokenizer and the bmodel on device `devid` and allocates the
+  // device tensors used by the forward passes.
+  void init(int devid, const std::string model, const std::string tokenizer_path);
+  // NOTE(review): declared here but not defined in the visible part of the file.
+  void chat();
+  // Frees the device tensors and destroys the runtime created by init().
+  void deinit();
+  std::string name;
+  // Prompt text plus everything generated for it so far.
+  std::string history = "";
+  int round = 0;
+  // Number of tokens processed so far; set by forward_first().
+  int token_length = 0;
+  // End-of-sequence token id, filled in by load_sentencepiece().
+  int EOS = 0;
+  // Generates one more token; returns "_GETEOS_" or "_GETMAX_" when done.
+  std::string predict_next_token();
+  // Encodes `input_str`, runs the prompt and returns the first generated text.
+  std::string predict_first_token(const std::string &input_str);
+
+private:
+  int forward_first(std::vector<int> &tokens);
+  int forward_next();
+  void load_sentencepiece(const std::string &tokenizer_path);
+
+private:
+  std::vector<bm_handle_t> handles;
+  bm_handle_t bm_handle{};
+  void *p_bmrt = nullptr;
+  sentencepiece::SentencePieceProcessor sentencepiece;
+  const bm_net_info_t *net_blocks[NUM_LAYERS] = {};
+  const bm_net_info_t *net_blocks_cache[NUM_LAYERS] = {};
+  const bm_net_info_t *net_embed = nullptr;
+  const bm_net_info_t *net_lm = nullptr;
+  bm_tensor_t inputs_embed_512, outputs_embed_512;
+  bm_tensor_t inputs_lm, outputs_lm;
+  bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention;
+  bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS];
+  bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS];
+  bm_tensor_t present_key_cache, present_value_cache;
+  std::string name_embed;
+  std::string name_lm;
+  std::string name_blocks[NUM_LAYERS];
+  std::string name_blocks_cache[NUM_LAYERS];
+};
+
+// Loads the SentencePiece model file `model` and stores its end-of-sequence
+// id in EOS. On failure the status text is printed and the process exits
+// with code -1.
+void Baichuan2::load_sentencepiece(const std::string &model) {
+  printf("Load %s ... ", model.c_str());
+  const auto load_status = sentencepiece.Load(model);
+  if (load_status.ok()) {
+    EOS = sentencepiece.eos_id();
+    printf("Done!\n");
+    return;
+  }
+  std::cout << load_status.ToString() << std::endl;
+  exit(-1);
+}
+
+// Prepares the model for inference on device `devid`.
+//
+// Loads the tokenizer from `tokenizer_path`, requests the device handle,
+// creates the runtime, loads the bmodel file `model`, resolves the
+// "embedding", "lm_head", "block_<i>" and "block_cache_<i>" networks by name
+// and allocates every device tensor that deinit() later frees.
+// Failures are reported through assert(), like everywhere else in this file.
+void Baichuan2::init(int devid, const std::string model, const std::string tokenizer_path) {
+  load_sentencepiece(tokenizer_path);
+  // request bm_handle
+  bm_status_t status = bm_dev_request(&bm_handle, devid);
+  assert(BM_SUCCESS == status);
+  // Track the handle so that the release loop over `handles` in deinit()
+  // actually gives the device back.
+  handles.push_back(bm_handle);
+
+  // create bmruntime
+  p_bmrt = bmrt_create(bm_handle);
+  assert(NULL != p_bmrt);
+
+  // load bmodel by file
+  printf("Model[%s] loading ....\n", model.c_str());
+  bool ret = bmrt_load_bmodel(p_bmrt, model.c_str());
+  assert(true == ret);
+  printf("Done!\n");
+
+  // net names
+  name_embed = "embedding";
+  name_lm = "lm_head";
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    name_blocks[i] = "block_" + std::to_string(i);
+    name_blocks_cache[i] = "block_cache_" + std::to_string(i);
+  }
+
+  // net infos; every lookup is checked because the result is dereferenced below
+  net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
+  assert(NULL != net_embed);
+  net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str());
+  assert(NULL != net_lm);
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    net_blocks[i] = bmrt_get_network_info(p_bmrt, name_blocks[i].c_str());
+    assert(NULL != net_blocks[i]);
+    net_blocks_cache[i] =
+        bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str());
+    assert(NULL != net_blocks_cache[i]);
+  }
+
+  // Allocates one device tensor with the given dtype/shape and checks the result.
+  auto alloc = [this](bm_tensor_t &tensor, auto dtype, const auto &shape) {
+    bool ok = bmrt_tensor(&tensor, p_bmrt, dtype, shape);
+    assert(true == ok);
+    (void)ok;
+  };
+
+  // All layers are sized from the first block / first cached block.
+  const bm_net_info_t *block = net_blocks[0];
+  const bm_net_info_t *cache = net_blocks_cache[0];
+
+  // net device mem
+  // The embedding tensors are sized from stage 1 of the embedding net;
+  // forward_next() re-shapes to stage 0 when it feeds a single token.
+  alloc(inputs_embed_512, net_embed->input_dtypes[0],
+        net_embed->stages[1].input_shapes[0]);
+  alloc(outputs_embed_512, net_embed->output_dtypes[0],
+        net_embed->stages[1].output_shapes[0]);
+
+  alloc(inputs_pid, block->input_dtypes[1], block->stages[0].input_shapes[1]);
+  alloc(inputs_attention, block->input_dtypes[2],
+        block->stages[0].input_shapes[2]);
+  alloc(next_pid, cache->input_dtypes[1], cache->stages[0].input_shapes[1]);
+  alloc(next_attention, cache->input_dtypes[2],
+        cache->stages[0].input_shapes[2]);
+
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    alloc(past_key[i], block->output_dtypes[1],
+          block->stages[0].output_shapes[1]);
+    alloc(past_value[i], block->output_dtypes[2],
+          block->stages[0].output_shapes[2]);
+    // NOTE(review): present_key/present_value are allocated here and freed in
+    // deinit(), but the forward passes in this file only use the *_cache pair.
+    alloc(present_key[i], block->output_dtypes[1],
+          block->stages[0].output_shapes[1]);
+    alloc(present_value[i], block->output_dtypes[2],
+          block->stages[0].output_shapes[2]);
+  }
+  alloc(present_key_cache, cache->output_dtypes[1],
+        cache->stages[0].output_shapes[1]);
+  alloc(present_value_cache, cache->output_dtypes[2],
+        cache->stages[0].output_shapes[2]);
+
+  alloc(inputs_lm, net_lm->input_dtypes[0], net_lm->stages[0].input_shapes[0]);
+  alloc(outputs_lm, net_lm->output_dtypes[0],
+        net_lm->stages[0].output_shapes[0]);
+}
+
+// Frees the device memory of every tensor allocated in init(), destroys the
+// runtime and then frees each handle stored in `handles`.
+void Baichuan2::deinit() {
+  // Tensors that exist once per model.
+  const bm_tensor_t *single_tensors[] = {
+      &inputs_embed_512, &outputs_embed_512,  &inputs_lm,
+      &outputs_lm,       &inputs_pid,         &next_pid,
+      &inputs_attention, &next_attention,     &present_key_cache,
+      &present_value_cache};
+  for (const bm_tensor_t *tensor : single_tensors) {
+    bm_free_device(bm_handle, tensor->device_mem);
+  }
+
+  // Tensors that exist once per layer.
+  for (int layer = 0; layer < NUM_LAYERS; ++layer) {
+    bm_free_device(bm_handle, past_key[layer].device_mem);
+    bm_free_device(bm_handle, past_value[layer].device_mem);
+    bm_free_device(bm_handle, present_key[layer].device_mem);
+    bm_free_device(bm_handle, present_value[layer].device_mem);
+  }
+
+  bmrt_destroy(p_bmrt);
+  for (auto handle : handles) {
+    bm_dev_free(handle);
+  }
+}
+
+
+
+// Runs the whole prompt through the network and returns the value read back
+// from the lm_head output, which the callers treat as the next token id.
+//
+// `tokens` holds the prompt ids; it must be non-empty and contain at most
+// MAX_LEN entries. The ids are zero-padded to MAX_LEN, a lower-triangular mask
+// limited to the real tokens is built, and the embedding, every block and
+// lm_head are launched in turn. Block i writes its second and third outputs
+// into past_key[i] / past_value[i]. token_length is set to the prompt length.
+int Baichuan2::forward_first(std::vector<int> &tokens) {
+  // The MAX_LEN-sized buffers cannot hold more ids, and an empty prompt would
+  // make the (token_length - 1) offset below negative.
+  assert(!tokens.empty() && tokens.size() <= static_cast<size_t>(MAX_LEN));
+  token_length = tokens.size();
+
+  // Heap storage: the mask alone is MAX_LEN * MAX_LEN floats (1 MiB), which is
+  // too much to place on the stack of whatever thread calls into the library.
+  std::vector<int> input_ids(MAX_LEN, 0); // start token
+  std::vector<int> position_id(MAX_LEN, 0);
+  std::vector<float> attention_mask(MAX_LEN * MAX_LEN, ATTENTION_MASK);
+
+  std::copy(tokens.begin(), tokens.end(), input_ids.begin());
+  for (int i = 0; i < token_length; i++) {
+    position_id[i] = i;
+  }
+
+  // Entry (i, j) is left unmasked only when row i is a real token and j <= i;
+  // every other entry keeps ATTENTION_MASK.
+  for (int i = 0; i < token_length; i++) {
+    for (int j = 0; j <= i; j++) {
+      attention_mask[i * MAX_LEN + j] = 0;
+    }
+  }
+
+  // forward embeding
+  bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem, input_ids.data());
+  auto ret =
+      bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1,
+                            &outputs_embed_512, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  // forward blocks
+  bm_memcpy_s2d(bm_handle, inputs_pid.device_mem, position_id.data());
+  bm_memcpy_s2d(bm_handle, inputs_attention.device_mem, attention_mask.data());
+  // Each block reads and writes the same hidden-state buffer.
+  auto inputs_embed = outputs_embed_512;
+  inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0];
+  bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention};
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]};
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3,
+                                outputs_block, 3, true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+  }
+
+  // Only the slice belonging to the last real token is passed to lm_head.
+  int bytes = inputs_embed.device_mem.size / MAX_LEN;
+  bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0,
+                     inputs_embed.device_mem, (token_length - 1) * bytes,
+                     bytes);
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
+                              &outputs_lm, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, &token, outputs_lm.device_mem);
+  return token;
+}
+
+// Generates one more token from the cached per-layer state and returns the
+// value read back from the lm_head output.
+//
+// The previous lm_head output (outputs_lm) is fed straight back into the
+// embedding net, each block_cache_<i> net is launched with past_key[i] /
+// past_value[i] as extra inputs, and the key/value it produces are copied into
+// those buffers at slot (token_length - 1). Requires token_length to be in
+// [1, MAX_LEN], i.e. forward_first() must have run before.
+int Baichuan2::forward_next() {
+  // token_length - 1 is used below both as an array index and as a byte
+  // offset into the caches, so it has to be a valid slot.
+  assert(token_length >= 1 && token_length <= MAX_LEN);
+
+  // Slots [token_length - 1, MAX_LEN) are masked out; the earlier slots and
+  // the extra trailing entry at index MAX_LEN stay at 0.
+  float attention_mask[MAX_LEN + 1] = {0};
+  for (int i = token_length - 1; i < MAX_LEN; i++) {
+    attention_mask[i] = ATTENTION_MASK;
+  }
+  int32_t position_id = token_length - 1;
+  // embedding
+  outputs_lm.shape = net_embed->stages[0].input_shapes[0];
+  auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &outputs_lm, 1,
+                                   &inputs_lm, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  // blocks
+  bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask);
+  bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id);
+  auto inputs_embed = inputs_lm;
+  inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0];
+  // Size of one cache entry; also the stride between slots in past_key/value.
+  int bytes = bm_mem_get_device_size(present_key_cache.device_mem);
+  int token_offset = (token_length - 1) * bytes;
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention,
+                                   past_key[i], past_value[i]};
+    bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache};
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
+                                inputs_block, 5, outputs_block, 3, true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+    // Store this step's key/value in the layer's cache.
+    bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset,
+                       present_key_cache.device_mem, 0,
+                       bytes);
+    bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset,
+                       present_value_cache.device_mem, 0,
+                       bytes);
+  }
+  // Restore the lm_head output shape that was overwritten for the embedding.
+  outputs_lm.shape = net_lm->stages[0].output_shapes[0];
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
+                              &outputs_lm, 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
+  return token;
+}
+
+
+// Starts a new answer: replaces `history` with `input_str`, encodes it, runs
+// the prompt through forward_first() and returns the text contributed by the
+// first generated token (also appended to `history`).
+//
+// Returns a "Sorry: ..." message when the input encodes to no tokens and an
+// "Error: ..." message when it encodes to more than MAX_LEN - 10 tokens; in
+// both cases no forward pass is run.
+std::string Baichuan2::predict_first_token(const std::string &input_str) {
+  history = input_str;
+  std::vector<int> tokens;
+  sentencepiece.Encode(history, &tokens);
+  // Test for an empty encoding before the leading id is inserted; once it is
+  // inserted the vector can never be empty and the check would be dead code.
+  if (tokens.empty()) {
+    round = 0;
+    history = "Sorry: your question is too wierd!!\n";
+    return history;
+  }
+  // Prepend token id 1 (presumably the BOS id — confirm against the tokenizer).
+  tokens.insert(tokens.begin(), 1);
+  // make sure token not too large
+  if (tokens.size() > MAX_LEN - 10) {
+    // reset
+    if (round == 0) {
+      history = "Error: your question is too large!\n";
+      return history;
+    }
+    // Retry once with round cleared; the second attempt ends in the branch above.
+    round = 0;
+    history = "";
+    return predict_first_token(input_str);
+  }
+  int token = forward_first(tokens);
+  // Decode {0} and {0, token} and keep the suffix that `token` adds, instead
+  // of decoding the token on its own.
+  int pre_token = 0;
+  std::string pre_word;
+  std::string word;
+  std::vector<int> pre_ids = {pre_token};
+  std::vector<int> ids = {pre_token,token};
+  sentencepiece.Decode(pre_ids, &pre_word);
+  sentencepiece.Decode(ids, &word);
+  std::string diff = word.substr(pre_word.size());
+#ifdef PRINT
+  printf("token %d",token);
+  printf("diff %s",diff.c_str());
+#endif
+  history += diff;
+  if (token_length < MAX_LEN) {
+    token_length++;
+  }
+  return diff;
+}
+
+// Produces the next piece of the answer.
+//
+// Returns "_GETEOS_" when forward_next() yields the EOS id (round is reset and
+// `history` is cut to its second half), "_GETMAX_" when token_length has
+// already reached MAX_LEN, and otherwise the text contributed by the new
+// token, which is also appended to `history`.
+std::string Baichuan2::predict_next_token() {
+  const int token = forward_next();
+  if (token == EOS) {
+    round = 0;
+    history = history.substr(history.size() / 2);
+    return "_GETEOS_";
+  }
+
+  // Decode {0} and {0, token}; the extra suffix is the text of `token`.
+  const int anchor = 0;
+  std::vector<int> anchor_ids = {anchor};
+  std::vector<int> pair_ids = {anchor, token};
+  std::string anchor_text;
+  std::string pair_text;
+  sentencepiece.Decode(anchor_ids, &anchor_text);
+  sentencepiece.Decode(pair_ids, &pair_text);
+  std::string piece = pair_text.substr(anchor_text.size());
+#ifdef PRINT
+  printf("token %d",token);
+  printf("diff %s",piece.c_str());
+#endif
+  history += piece;
+
+  if (token_length >= MAX_LEN) {
+    round = 0;
+    return "_GETMAX_";
+  }
+  token_length++;
+  return piece;
+}
+
+
+// C-linkage entry points around Baichuan2. Every function that returns
+// `const char *` returns a fresh strdup() copy.
+extern "C" {
+
+// Allocates a Baichuan2 with new and initialises it with the given device id,
+// bmodel path and tokenizer path.
+Baichuan2 *Baichuan2_with_devid_and_model(int devid, const char *bmodel_path, const char *tokenizer_path) {
+  auto *instance = new Baichuan2();
+  instance->init(devid, bmodel_path, tokenizer_path);
+  return instance;
+}
+
+// Deletes the object; does not call deinit().
+void Baichuan2_delete(Baichuan2 *chat) {
+  delete chat;
+}
+
+// Forwards to Baichuan2::deinit().
+void Baichuan2_deinit(Baichuan2 *chat) {
+  chat->deinit();
+}
+
+// Returns a copy of the current history text.
+const char *get_history(Baichuan2 *chat) {
+  return strdup(chat->history.c_str());
+}
+
+// Replaces the history text and returns a copy of the new value.
+const char *set_history(Baichuan2 *chat, const char *history) {
+  chat->history = history;
+  return strdup(history);
+}
+
+// Runs predict_first_token() on `input_str` and returns a copy of its result.
+const char *Baichuan2_predict_first_token(Baichuan2 *chat, const char *input_str) {
+  const std::string first = chat->predict_first_token(input_str);
+  return strdup(first.c_str());
+}
+
+// Runs predict_next_token() and returns a copy of its result.
+const char *Baichuan2_predict_next_token(Baichuan2 *chat) {
+  const std::string next = chat->predict_next_token();
+  return strdup(next.c_str());
+}
+
+// Returns the end-of-sequence token id stored on the object.
+const int get_eos(Baichuan2 *chat){
+  return chat->EOS;
+}
+}
diff --git a/models/Baichuan2/web_demo/chat.py b/models/Baichuan2/web_demo/chat.py
new file mode 100755
index 0000000..804bce8
--- /dev/null
+++ b/models/Baichuan2/web_demo/chat.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+
+import ctypes
+
+
+class TokenWord(ctypes.Structure):
+    """ctypes structure pairing an int `token` with a fixed-size `word` buffer.
+
+    NOTE(review): nothing else in this module references this class.
+    """
+    _fields_ = [
+        ("token", ctypes.c_int),
+        ("word", ctypes.c_char * 2048)  # fixed 2048-byte buffer (the original note said 100); adjust the size as needed
+    ]
+
+
+class TPUChatglm:
+    def __init__(self):
+        self.lib = ctypes.cdll.LoadLibrary('./build/libtpuchat.so')
+        device_id = 3
+        bmodel_path = "../model/baichuan2-7b-test_int8.bmodel"
+        token_path = "../model/tokenizer.model"
+        self.device_id = device_id
+        self.bmodel_path = bmodel_path
+        self.token_path = token_path
+        self.libset()
+        self.init()
+
+    def libset(self):
+        self.lib.Baichuan2_with_devid_and_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
+        self.lib.Baichuan2_with_devid_and_model.restype = ctypes.c_void_p
+
+        self.lib.Baichuan2_delete.argtypes = [ctypes.c_void_p]
+
+        # deinit
+        self.lib.Baichuan2_deinit.argtypes = [ctypes.c_void_p]
+
+        # Baichuan2_predict_first_token
+        self.lib.Baichuan2_predict_first_token.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+        self.lib.Baichuan2_predict_first_token.restype = ctypes.c_char_p
+
+        # Baichuan2_predict_next_token
+        self.lib.Baichuan2_predict_next_token.argtypes = [ctypes.c_void_p]
+        self.lib.Baichuan2_predict_next_token.restype = ctypes.c_char_p
+
+        # get_eos
+        self.lib.get_eos.argtypes = [ctypes.c_void_p]
+        self.lib.get_eos.restype = ctypes.c_int
+        # get_history
+        self.lib.get_history.argtypes = [ctypes.c_void_p]
+        self.lib.get_history.restype = ctypes.c_char_p
+        # set history
+        self.lib.set_history.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+
+    def init(self):
+        self.obj = self.lib.Baichuan2_with_devid_and_model(self.device_id, self.bmodel_path.encode('utf-8'),
+                                                          self.token_path.encode('utf-8'))
+
+    def predict_first_token(self, context):
+        return self.lib.Baichuan2_predict_first_token(self.obj, context.encode('utf-8')).decode('utf-8')
+
+    def predict_next_token(self):
+        return self.lib.Baichuan2_predict_next_token(self.obj).decode('utf-8')
+
+    def predict(self, context):
+
+        first_token = self.predict_first_token(context)
+        # print(first_token, end='')
+        res = ''
+        while True:
+            next_token = self.predict_next_token()
+            if next_token == '_GETMAX_' or next_token == '_GETEOS_':
+                # print(next_token)
+                break
+            # print(next_token, end='')
+            res += next_token
+        return res
+
+    def stream_predict(self, query, history):
+        history.append((query, ''))
+
+        prompt = ''
+        # for i, (old_query, response) in enumerate(history):
+        #     prompt += "[Round {}]\n\n问：{}\n\n答：{}\n\n".format(i + 1, old_query, response)
+        # prompt += "[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
+        prompt = "<reserved_106>" + query + "<reserved_107>"
+        
+        res = ''
+        first_token = self.predict_first_token(prompt)
+        res += first_token
+
+        while True:
+            next_token = self.predict_next_token()
+            if next_token == '_GETMAX_' or next_token == '_GETEOS_':
+                break
+            res += next_token
+            history[-1] = (query, res)
+            yield res, history
+
+    def get_config(self):
+        pass
\ No newline at end of file
diff --git a/models/Baichuan2/web_demo/web_demo.py b/models/Baichuan2/web_demo/web_demo.py
new file mode 100755
index 0000000..1dc5ee2
--- /dev/null
+++ b/models/Baichuan2/web_demo/web_demo.py
@@ -0,0 +1,108 @@
+import time
+import gradio as gr
+import mdtex2html
+from chat import TPUChatglm
+
+
+def postprocess(self, y):
+    if y is None:
+        return []
+    for i, (message, response) in enumerate(y):
+        y[i] = (
+            None if message is None else mdtex2html.convert((message)),
+            None if response is None else mdtex2html.convert(response),
+        )
+    return y
+
+
+# Replace gr.Chatbot.postprocess with the mdtex2html-based function defined above.
+gr.Chatbot.postprocess = postprocess
+
+# Model wrapper created once at import time; predict() streams replies from it.
+glm = TPUChatglm()
+
+def parse_text(text):
+    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split('`')
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = f'<br></code></pre>'
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", "\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+
+
+def gen(input, history):
+    i = 0
+    history.append((input, ''))
+    res = ''
+    while i < 10:
+        i += 1
+        res += str(i)
+        time.sleep(0.05)
+        history[-1] = (input, res)
+        yield res, history
+
+
+def predict(input, chatbot, max_length, top_p, temperature, history):
+
+    chatbot.append((parse_text(input), ""))
+    for response, history in glm.stream_predict(input, history):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+        yield chatbot, history
+
+
+def reset_user_input():
+    """Return a gradio update that sets the bound component's value to ''."""
+    return gr.update(value='')
+
+
+def reset_state():
+    return [], [], None
+
+
+# Page layout: a chatbot pane, a text box with a Submit button, and a side
+# column holding the Clear History button and three sliders.
+with gr.Blocks() as demo:
+    gr.HTML("""<h1 align="center">Baichuan2-7B TPU</h1>""")
+
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        with gr.Column(scale=4):
+            with gr.Column(scale=12):
+                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
+                    container=False)
+            with gr.Column(min_width=32, scale=1):
+                submitBtn = gr.Button("Submit", variant="primary")
+        with gr.Column(scale=1):
+            emptyBtn = gr.Button("Clear History")
+            # The slider values are passed to predict(), which does not use them.
+            max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True)
+            top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True)
+            temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
+
+    # Conversation history kept as gradio state.
+    history = gr.State([])
+
+    # Submit has two handlers: stream the reply, and clear the text box.
+    submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history],
+                    [chatbot, history], show_progress=True)
+    submitBtn.click(reset_user_input, [], [user_input])
+
+    emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)
+
+# Serve on all interfaces with queuing enabled; share=True also requests a
+# public gradio link and inbrowser=True opens a browser tab on start.
+demo.queue().launch(share=True, server_name="0.0.0.0", inbrowser=True)
\ No newline at end of file
diff --git a/models/ChatGLM3/compile/compile.sh b/models/ChatGLM3/compile/compile.sh
index d1fe154..6fd6028 100755
--- a/models/ChatGLM3/compile/compile.sh
+++ b/models/ChatGLM3/compile/compile.sh
@@ -96,6 +96,7 @@ model_deploy.py \
     --quant_input \
     --quant_output \
     --chip bm1684x \
+    $device_args \
     --model embedding_cache.bmodel
 
 rm *.npz
@@ -112,7 +113,7 @@ pushd $outdir
 
 model_transform.py \
     --model_name lm_head \
-    --model_def ../../lm_head.onnx \
+    --model_def ../../onnx/lm_head.onnx \
     --mlir lm_head.mlir
 
 model_deploy.py \
@@ -141,24 +142,28 @@ for ((i=0; i<=$num_layers; i++)); do
 
     model_transform.py \
         --model_name block_$i \
-        --model_def ../../block_$i.onnx \
+        --model_def ../../onnx/block_$i.onnx \
         --mlir block_$i.mlir
 
     model_deploy.py \
         --mlir block_$i.mlir \
         $quantize_args \
+        --quant_input \
+        --quant_output \
         --chip bm1684x \
         $device_args \
         --model block_$i.bmodel
 
     model_transform.py \
         --model_name block_cache_$i \
-        --model_def ../../block_cache_$i.onnx \
+        --model_def ../../onnx/block_cache_$i.onnx \
         --mlir block_cache_$i.mlir
 
     model_deploy.py \
         --mlir block_cache_$i.mlir \
         $quantize_args \
+        --quant_input \
+        --quant_output \
         --chip bm1684x \
         $device_args \
         --model block_cache_$i.bmodel
diff --git a/models/ChatGLM3/compile/export_onnx.py b/models/ChatGLM3/compile/export_onnx.py
index 56cb542..a23d33b 100755
--- a/models/ChatGLM3/compile/export_onnx.py
+++ b/models/ChatGLM3/compile/export_onnx.py
@@ -141,7 +141,8 @@ def convert_block_cache(layer_id):
 
 def convert_embedding():
     model = Embedding()
-    torch.onnx.export(model, (torch.tensor([0, 1, 2, 3])),
+    input = torch.tensor([range(SEQ_LENGTH)])
+    torch.onnx.export(model, (input),
                       f'{folder}/embedding.onnx',
                       verbose=False,
                       input_names=['input_ids'],
diff --git a/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py b/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py
index 163d634..a970776 100755
--- a/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py
+++ b/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py
@@ -278,7 +278,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask):
                 attention_mask.tril_()
                 attention_mask = ~attention_mask
             if attention_mask is not None:
-                attention_scores = attention_scores + (attention_mask * -10000.0)
+                attention_scores = attention_scores + attention_mask
                 #attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
             attention_probs = F.softmax(attention_scores, dim=-1)
             attention_probs = attention_probs.type_as(value_layer)
diff --git a/models/ChatGLM3/demo/CMakeLists.txt b/models/ChatGLM3/demo/CMakeLists.txt
index a9c250b..e135a49 100755
--- a/models/ChatGLM3/demo/CMakeLists.txt
+++ b/models/ChatGLM3/demo/CMakeLists.txt
@@ -1,26 +1,28 @@
 cmake_minimum_required(VERSION 2.8)
 project(chatglm)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "") 
+
 if (NOT DEFINED TARGET_ARCH)
     set(TARGET_ARCH pcie)
 endif()
 
-include_directories(${PROJECT_SOURCE_DIR}/../src/include)
+include_directories(${PROJECT_SOURCE_DIR}/../support/include)
 
 if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
 	add_definitions(-DSOC_TARGET)
-	link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
+	link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc)
 	message("SoC mode, starting......")
 elseif (${TARGET_ARCH} STREQUAL "pcie")
     add_definitions(-DPCIE_TARGET)
-    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie)
+    link_directories(${PROJECT_SOURCE_DIR}/../support/lib_pcie)
 	message("PCIE mode, starting......")
 elseif (${TARGET_ARCH} STREQUAL "soc")
     add_definitions(-DSOC_TARGET)
     set(CMAKE_C_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc)
     set(CMAKE_ASM_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc)
     set(CMAKE_CXX_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-g++)
-    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
+    link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc)
 	message("SoC mode, starting......")
 endif()
 
@@ -28,4 +30,9 @@ add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
 set(CMAKE_BUILD_TYPE "Debug")
 
 add_executable(chatglm demo.cpp)
-target_link_libraries(chatglm bmlib bmrt sentencepiece)
+target_link_libraries(chatglm bmrt bmlib sentencepiece)
+
+if (${TARGET_ARCH} STREQUAL "pcie")
+    add_executable(chatglm_parallel demo_parallel.cpp)
+    target_link_libraries(chatglm_parallel bmrt bmlib sentencepiece)
+endif()
\ No newline at end of file
diff --git a/models/ChatGLM3/demo/demo_parallel.cpp b/models/ChatGLM3/demo/demo_parallel.cpp
new file mode 100755
index 0000000..671a0f7
--- /dev/null
+++ b/models/ChatGLM3/demo/demo_parallel.cpp
@@ -0,0 +1,615 @@
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2023 Sophgo Technologies Inc.  All rights reserved.
+//
+// TPU-MLIR is licensed under the 2-Clause BSD License except for the
+// third-party components.
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <assert.h>
+#include <chrono>
+#include <algorithm>
+#include "memory.h"
+#include "sentencepiece/sentencepiece_processor.h"
+#include "bmruntime_interface.h"
+#include <getopt.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+static const uint16_t ATTENTION_MASK = 0xF0E2; // 0xF0E2 is -10000.0 as an IEEE-754 binary16 bit pattern; presumably the mask tensors are fp16 -- TODO confirm
+
+class ChatGLM { // chat session that runs a ChatGLM bmodel on one or more devices
+public:
+  void init(const std::vector<int> &devid, std::string model_path, std::string tokenizer_path); // request devices, load tokenizer + bmodel, allocate device tensors
+  void chat();   // interactive stdin loop; ends when the user types "exit"
+  void deinit(); // free device tensors, destroy the runtime, release the device handles
+
+private:
+  void answer(const std::string &input_str); // generate and print the reply to one question
+  void tokenizer_encode(const std::string &input_str, std::vector<int> &tokens); // NOTE(review): declared, but no definition appears in this file
+  int forward_first(std::vector<int> &tokens); // prefill over the whole prompt; returns the first generated token
+  int forward_next(int cur_token);             // one decode step; returns the next token
+  void move2end(const bm_tensor_t &kv);        // move the valid KV rows to the tail of their buffer
+  void load_sentencepiece(std::string tokenizer_path); // load the tokenizer model and set EOS
+  void build_system_prompt(); // reset history_tokens to head_prompt + system_prompt
+
+private:
+  std::vector<bm_handle_t> handles; // one handle per requested device
+  bm_handle_t bm_handle;            // copy of handles[0]
+  void *p_bmrt;                     // bmruntime context created in init()
+  sentencepiece::SentencePieceProcessor sentencepiece; // tokenizer
+  const bm_net_info_t *net_embed;       // info of network "embedding"
+  const bm_net_info_t *net_embed_cache; // info of network "embedding_cache"
+  const bm_net_info_t *net_lm;          // info of network "lm_head"
+  std::vector<const bm_net_info_t *> net_blocks;       // per layer: "block_<i>"
+  std::vector<const bm_net_info_t *> net_blocks_cache; // per layer: "block_cache_<i>"
+  std::vector<bm_tensor_t> inputs_embed_512, outputs_embed_512; // I/O tensors of the "embedding" network
+  std::vector<bm_tensor_t> inputs_pid, next_pid, inputs_attention, next_attention; // position-id / attention-mask inputs, one per device: inputs_* for block_<i>, next_* for block_cache_<i>
+  std::vector<std::vector<bm_tensor_t>> past_key, past_value; // KV cache tensors indexed [layer][device]
+  std::vector<bm_tensor_t> inputs_lm, outputs_lm; // lm_head I/O tensors, one per device
+  std::string name_embed;       // network names used for lookup and launch
+  std::string name_embed_cache;
+  std::string name_lm;
+  std::vector<std::string> name_blocks;
+  std::vector<std::string> name_blocks_cache;
+  std::string system_string =
+      "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
+      "the user's instructions carefully. Respond using markdown.";
+  std::vector<int> history_tokens; // token ids of the conversation so far
+  std::vector<int> head_prompt{64790, 64792, 64794, 30910,
+                               13}; // head + system id + \n
+  std::vector<int> system_prompt; // sentencepiece encoding of system_string
+
+  int device_num;   // number of devices passed to init()
+  int round = 0;    // answered rounds since the history was last reset
+  int token_length; // tokens currently in the sequence (set in forward_first, grown per generated token)
+  int EOS;          // sentencepiece eos id
+  int SEQLEN;       // dims[1] of the embedding network's first input shape
+  int NUM_LAYERS;   // (network count - 2) / 2
+};
+
+// Load the SentencePiece model and cache its EOS id; prints the status and exits on failure.
+void ChatGLM::load_sentencepiece(std::string tokenizer_path) {
+  printf("Load %s ... ", tokenizer_path.c_str());
+  if (const auto load_status = sentencepiece.Load(tokenizer_path); !load_status.ok()) {
+    std::cout << load_status.ToString() << std::endl;
+    exit(-1);
+  }
+  EOS = sentencepiece.eos_id();
+  printf("Done!\n");
+}
+
+// Bring up a session: request every device, load tokenizer and bmodel, look up the networks and allocate all device tensors.
+void ChatGLM::init(const std::vector<int> &devices, std::string model_path, std::string tokenizer_path) {
+  device_num = static_cast<int>(devices.size());
+  assert(device_num > 0); // handles[0] and every per-device tensor below need at least one device
+  load_sentencepiece(tokenizer_path);
+  // request bm_handle
+  std::cout << "Device [ ";
+  for (auto d : devices) std::cout << d << " ";
+  std::cout << "] loading ....\n";
+  for (auto d : devices) {
+    bm_handle_t h;
+    bm_status_t status = bm_dev_request(&h, d);
+    assert(BM_SUCCESS == status);
+    handles.push_back(h);
+  }
+  bm_handle = handles[0];
+
+  // decode system prompt
+  sentencepiece.Encode(system_string, &system_prompt);
+
+  // create bmruntime
+#ifdef SOC_TARGET
+  p_bmrt = bmrt_create(handles[0]);
+#else
+  p_bmrt = bmrt_create_ex(handles.data(), handles.size());
+#endif
+  assert(NULL != p_bmrt);
+
+  // load bmodel by file
+  printf("Model[%s] loading ....\n", model_path.c_str());
+  bool ret = bmrt_load_bmodel(p_bmrt, model_path.c_str());
+  assert(true == ret);
+  printf("Done!\n");
+
+  // set NUM_LAYERS
+  auto num_nets = bmrt_get_network_number(p_bmrt);
+  NUM_LAYERS = (num_nets - 2) / 2;
+
+  // net names
+  name_embed = "embedding";
+  name_embed_cache = "embedding_cache";
+  name_lm = "lm_head";
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    name_blocks.emplace_back("block_" + std::to_string(i));
+    name_blocks_cache.emplace_back("block_cache_" + std::to_string(i));
+  }
+
+  // net infos
+  net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
+  net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str());
+  net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str());
+  assert(net_embed && net_embed_cache && net_lm); // fail here, not on a later dereference, if a network is missing
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    net_blocks.emplace_back(
+        bmrt_get_network_info(p_bmrt, name_blocks[i].c_str()));
+    net_blocks_cache.emplace_back(
+        bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str()));
+  }
+  assert(NUM_LAYERS > 0 && net_blocks[0] && net_blocks_cache[0]); // layer 0 supplies the tensor shapes used below
+
+  // set SEQLEN
+  SEQLEN = net_embed->stages[0].input_shapes[0].dims[1];
+
+  // per-layer KV cache containers (net_blocks / net_blocks_cache were filled above)
+  past_key.resize(NUM_LAYERS);
+  past_value.resize(NUM_LAYERS);
+
+  // net device mem
+  assert(net_embed->input_num == device_num && net_embed->output_num == device_num); // loops below index embedding I/O by device
+  inputs_embed_512.resize(net_embed->input_num);
+  for (int i = 0; i < device_num; ++i) {
+    ret = bmrt_tensor_ex(&inputs_embed_512[i], p_bmrt,
+                        net_embed->input_loc_devices[i],
+                        net_embed->input_dtypes[i],
+                        net_embed->stages[0].input_shapes[i]);
+    assert(true == ret);
+  }
+
+  outputs_embed_512.resize(net_embed->output_num);
+  for (int i = 0; i < device_num; ++i) {
+    ret = bmrt_tensor_ex(&outputs_embed_512[i], p_bmrt,
+                        net_embed->output_loc_devices[i],
+                        net_embed->output_dtypes[i],
+                        net_embed->stages[0].output_shapes[i]);
+    assert(true == ret);
+  }
+
+  inputs_pid.resize(device_num);
+  inputs_attention.resize(device_num);
+  int in_num = net_blocks[0]->input_num / device_num;
+  for (int i = 0; i < device_num; ++i) {
+    ret = bmrt_tensor_ex(&inputs_pid[i], p_bmrt,
+                        net_blocks[0]->input_loc_devices[1 + i * in_num],
+                        net_blocks[0]->input_dtypes[1 + i * in_num],
+                        net_blocks[0]->stages[0].input_shapes[1 + i * in_num]);
+    assert(true == ret);
+
+    ret = bmrt_tensor_ex(&inputs_attention[i], p_bmrt,
+                        net_blocks[0]->input_loc_devices[2 + i * in_num],
+                        net_blocks[0]->input_dtypes[2 + i * in_num],
+                        net_blocks[0]->stages[0].input_shapes[2 + i * in_num]);
+    assert(true == ret);
+  }
+
+  next_pid.resize(device_num);
+  next_attention.resize(device_num);
+  int in_num_cache = net_blocks_cache[0]->input_num / device_num;
+  for (int i = 0; i < device_num; ++i) {
+    ret = bmrt_tensor_ex(&next_pid[i], p_bmrt,
+                        net_blocks_cache[0]->input_loc_devices[1 + i * in_num_cache],
+                        net_blocks_cache[0]->input_dtypes[1 + i * in_num_cache],
+                        net_blocks_cache[0]->stages[0].input_shapes[1 + i * in_num_cache]);
+    assert(true == ret);
+
+    ret = bmrt_tensor_ex(&next_attention[i], p_bmrt,
+                        net_blocks_cache[0]->input_loc_devices[2 + i * in_num_cache],
+                        net_blocks_cache[0]->input_dtypes[2 + i * in_num_cache],
+                        net_blocks_cache[0]->stages[0].input_shapes[2 + i * in_num_cache]);
+    assert(true == ret);
+  }
+
+  int out_num = net_blocks[0]->output_num / device_num;
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    past_key[i].resize(device_num);
+    past_value[i].resize(device_num);
+    for (int j = 0; j < device_num; j++) {
+      ret = bmrt_tensor_ex(&past_key[i][j], p_bmrt,
+                          net_blocks[0]->output_loc_devices[1 + j * out_num],
+                          net_blocks[0]->output_dtypes[1 + j * out_num],
+                          net_blocks[0]->stages[0].output_shapes[1 + j * out_num]);
+      assert(true == ret);
+      ret = bmrt_tensor_ex(&past_value[i][j], p_bmrt,
+                          net_blocks[0]->output_loc_devices[2 + j * out_num],
+                          net_blocks[0]->output_dtypes[2 + j * out_num],
+                          net_blocks[0]->stages[0].output_shapes[2 + j * out_num]);
+      assert(true == ret);
+    }
+  }
+
+  inputs_lm.resize(device_num);
+  outputs_lm.resize(device_num);
+  for (int i = 0; i < device_num; ++i) {
+    ret = bmrt_tensor_ex(&inputs_lm[i], p_bmrt, i, net_lm->input_dtypes[0],
+                        net_lm->stages[0].input_shapes[0]);
+    assert(true == ret);
+    ret = bmrt_tensor_ex(&outputs_lm[i], p_bmrt, i, net_lm->output_dtypes[0],
+                        net_lm->stages[0].output_shapes[0]);
+    assert(true == ret);
+  }
+}
+
+// Release what init() acquired: the per-device I/O tensors, then the per-layer
+// KV cache tensors, then the runtime, and finally the device handles.
+void ChatGLM::deinit() {
+  for (int dev = 0; dev < device_num; ++dev) {
+    const bm_handle_t owner = handles[dev];
+    for (const bm_tensor_t *t : {&inputs_embed_512[dev], &outputs_embed_512[dev], &inputs_pid[dev], &next_pid[dev],
+                                 &inputs_attention[dev], &next_attention[dev], &inputs_lm[dev], &outputs_lm[dev]}) {
+      bm_free_device(owner, t->device_mem);
+    }
+  }
+  for (int layer = 0; layer < NUM_LAYERS; ++layer) {
+    for (int dev = 0; dev < device_num; ++dev) {
+      bm_free_device(handles[dev], past_key[layer][dev].device_mem);
+      bm_free_device(handles[dev], past_value[layer][dev].device_mem);
+    }
+  }
+  // Tear down the runtime, then hand each device back.
+  bmrt_destroy(p_bmrt);
+  for (bm_handle_t handle : handles) {
+    bm_dev_free(handle);
+  }
+}
+
+// After a prefill block, move the first token_length of SEQLEN equal slices of
+// the KV buffer to its tail and zero everything before them. NOTE(review): both
+// copies go through bm_handle (handles[0]) even when kv was allocated on
+// another device -- confirm this is valid for multi-device runs.
+void ChatGLM::move2end(const bm_tensor_t &kv) {
+  if (token_length >= SEQLEN) {
+    return; // buffer is completely used, nothing to shift
+  }
+  const auto total_size = bm_mem_get_device_size(kv.device_mem);
+  const auto bytes = total_size / SEQLEN; // size of one of the SEQLEN slices
+  const auto real_size = token_length * bytes;
+  auto mem =
+      bm_mem_from_device(bm_mem_get_device_addr(kv.device_mem), real_size);
+  // Zero-filled host staging buffer; the vector frees itself on every exit path.
+  std::vector<uint8_t> dst(total_size, 0);
+  // Download the valid slices straight into the tail, then upload the whole buffer.
+  bm_memcpy_d2s(bm_handle, (void *)(dst.data() + total_size - real_size), mem);
+  bm_memcpy_s2d(bm_handle, kv.device_mem, (void *)dst.data());
+}
+
+// Prefill: embed the whole prompt, run every block once to fill the KV cache, and return the first generated token.
+int ChatGLM::forward_first(std::vector<int> &tokens) {
+  std::vector<int> input_ids(SEQLEN, 0);
+  std::vector<int> position_id(SEQLEN, 0);
+  std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, 0);
+
+  assert((int)tokens.size() + 2 <= SEQLEN); // the copy below must stay inside input_ids
+  input_ids[0] = 64790;
+  input_ids[1] = 64792;
+  std::copy(tokens.begin(), tokens.end(), input_ids.data() + 2);
+  token_length = tokens.size() + 2;
+  for (int i = 0; i < token_length; i++) {
+    position_id[i] = i;
+  }
+  for (int i = 0; i < SEQLEN; i++) {
+    for (int j = 0; j < SEQLEN; j++) {
+      if (!(j <= i && i < token_length)) { // mask later positions (j > i) and rows past the prompt
+        attention_mask[i * SEQLEN + j] = ATTENTION_MASK;
+      }
+    }
+  }
+
+  // forward embedding: the same host buffer is uploaded to every device
+  std::vector<int> input_nums(device_num, 1);
+  std::vector<void*> datas(device_num, (void*)input_ids.data());
+  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed_512.data(), datas.data(),
+                          input_nums.data(), device_num);
+  auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(),
+                                   inputs_embed_512.data(), inputs_embed_512.size(),
+                                   outputs_embed_512.data(), outputs_embed_512.size(),
+                                   true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  // forward blocks
+  std::vector<void*> pos_id_datas(device_num, position_id.data());
+  std::vector<void*> in_attn_datas(device_num, attention_mask.data());
+  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_pid.data(), pos_id_datas.data(),
+                          input_nums.data(), device_num);
+  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_attention.data(),in_attn_datas.data(),
+                          input_nums.data(), device_num);
+  auto embed_512 = outputs_embed_512;
+  std::vector<bm_tensor_t> inputs_block;
+  std::vector<bm_tensor_t> outputs_block;
+  for (int i = 0; i < device_num; ++i) {
+    embed_512[i].shape = net_blocks[0]->stages[0].input_shapes[0];
+    inputs_block.push_back(embed_512[i]);
+    inputs_block.push_back(inputs_pid[i]);
+    inputs_block.push_back(inputs_attention[i]);
+    outputs_block.push_back(embed_512[i]);
+    outputs_block.push_back(past_key[0][i]);
+    outputs_block.push_back(past_value[0][i]);
+  }
+
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    for (int j = 0; j < device_num; ++j) {
+      outputs_block[1 + j * 3] = past_key[i][j];
+      outputs_block[2 + j * 3] = past_value[i][j];
+    }
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(),
+                                inputs_block.data(), inputs_block.size(),
+                                outputs_block.data(), outputs_block.size(),
+                                true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+    for (int j = 0; j < device_num; ++j) {
+      move2end(past_key[i][j]);
+      move2end(past_value[i][j]);
+    }
+  }
+
+  // forward lmhead: feed only the slice of the last prompt token
+  int bytes = embed_512[0].device_mem.size / SEQLEN;
+  bm_memcpy_d2d_byte(bm_handle, inputs_lm[0].device_mem, 0,
+                     embed_512[0].device_mem, (token_length - 1) * bytes,
+                     bytes);
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1,
+                              &outputs_lm[0], 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem);
+  return token;
+}
+
+int ChatGLM::forward_next(int cur_token) { // one decode step: embed cur_token, run all cached blocks, return the next token
+  std::vector<uint16_t> attention_mask(SEQLEN + 1, 0);
+  for (int i = 0; i <= SEQLEN - token_length; i++) { // mask the first SEQLEN - token_length + 1 entries
+    attention_mask[i] = ATTENTION_MASK;
+  }
+  int32_t position_id = token_length - 1;
+
+  // forward embedding
+  std::vector<bm_tensor_t> inputs_embed;
+  std::vector<void*> input_datas;
+  std::vector<int> input_nums(device_num, 1);
+  for (int i = 0; i < device_num; ++i) {
+    inputs_embed.push_back(outputs_lm[i]); // token_id: reuse the lm_head output tensor as the embedding input
+    inputs_embed[i].shape = net_embed_cache->stages[0].input_shapes[0];
+    input_datas.push_back((void*)(&cur_token)); // every device receives the same token
+  }
+  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed.data(), input_datas.data(),
+                          input_nums.data(), device_num);
+  auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(),
+                                  inputs_embed.data(), inputs_embed.size(),
+                                  inputs_lm.data(), inputs_lm.size(), true, false); // embedding result lands in inputs_lm
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  // forward blocks
+  std::vector<void*> attn_datas(device_num, attention_mask.data());
+  std::vector<void*> pid_datas(device_num, &position_id);
+  bmrt_memcpy_s2d_parallel(p_bmrt, next_attention.data(), attn_datas.data(),
+                          input_nums.data(), device_num);
+  bmrt_memcpy_s2d_parallel(p_bmrt, next_pid.data(), pid_datas.data(),
+                          input_nums.data(), device_num);
+
+  // relies on inputs_lm holding device_num tensors (it is resized to device_num in init)
+  std::vector<bm_tensor_t> embed_1 = inputs_lm;
+  for (int i = 0; i < device_num; ++i) {
+    embed_1[i].shape = net_blocks_cache[0]->stages[0].input_shapes[0];
+  }
+  std::vector<bm_tensor_t> inputs_block;
+  std::vector<bm_tensor_t> outputs_block;
+  for (int i = 0; i < device_num; ++i) { // per device: 5 inputs (hidden, pid, mask, key, value) and 3 outputs (hidden, key, value)
+    inputs_block.push_back(embed_1[i]);
+    inputs_block.push_back(next_pid[i]);
+    inputs_block.push_back(next_attention[i]);
+    inputs_block.push_back(past_key[0][i]);
+    inputs_block.push_back(past_value[0][i]);
+    outputs_block.push_back(embed_1[i]);
+    outputs_block.push_back(past_key[0][i]);
+    outputs_block.push_back(past_value[0][i]);
+  }
+
+  for (int i = 0; i < NUM_LAYERS; i++) {
+    for (int j = 0; j < device_num; ++j) { // swap in layer i's KV tensors at the fixed slots
+      inputs_block[3 + j * 5] = past_key[i][j];
+      inputs_block[4 + j * 5] = past_value[i][j];
+      outputs_block[1 + j * 3] = past_key[i][j];
+      outputs_block[2 + j * 3] = past_value[i][j];
+    }
+    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
+                                inputs_block.data(), inputs_block.size(),
+                                outputs_block.data(), outputs_block.size(),
+                                true, false);
+    assert(ret);
+    bm_thread_sync(bm_handle);
+  }
+
+  // forward lmhead: only the first device's tensors are used
+  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1,
+                              &outputs_lm[0], 1, true, false);
+  assert(ret);
+  bm_thread_sync(bm_handle);
+
+  int token = 0;
+  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem); // read the token id back to the host
+  return token;
+}
+
+// Start a fresh conversation history: the fixed head_prompt ids followed by
+// the encoded system prompt.
+void ChatGLM::build_system_prompt() {
+  history_tokens = head_prompt;
+  history_tokens.insert(history_tokens.end(), system_prompt.cbegin(),
+                        system_prompt.cend());
+}
+
+// Interactive loop; ends on "exit" or when stdin hits EOF/error (a failed getline used to spin forever).
+void ChatGLM::chat() {
+  std::string input_str;
+  while (true) {
+    std::cout << "\nQuestion: ";
+    if (!std::getline(std::cin, input_str) || input_str == "exit") {
+      break;
+    }
+    std::cout << "\nAnswer: " << std::flush;
+    answer(input_str);
+    std::cout << std::endl;
+  }
+}
+
+// Answer one question: extend the history, prefill, then decode and print tokens until EOS or SEQLEN.
+void ChatGLM::answer(const std::string &input_str) {
+  int tok_num = 0;
+  std::vector<int> tokens;
+  std::vector<int> prompt{64795, 30910, 13};
+  sentencepiece.Encode(input_str, &tokens);
+  // Check the raw encoding: once the role ids are added the list can never be empty.
+  if (tokens.empty()) {
+    printf("Sorry: your question is too wierd!!\n");
+    round = 0;
+    history_tokens.clear();
+    return;
+  }
+  tokens.insert(tokens.begin(), prompt.begin(), prompt.end());
+  tokens.push_back(64796);
+  if (history_tokens.size() == 0) {
+    build_system_prompt();
+  }
+  history_tokens.insert(history_tokens.end(), tokens.begin(), tokens.end());
+  // make sure token not too large
+  if ((int)history_tokens.size() > SEQLEN - 10) {
+    // reset
+    history_tokens.clear();
+    if (round == 0) {
+      printf("Error: your question is too large!\n");
+      return;
+    }
+    round = 0;
+    answer(input_str);
+    return;
+  }
+  int pre_token = 0;
+  auto t0 = std::chrono::system_clock::now();
+  int token = forward_first(history_tokens);
+  auto t1 = std::chrono::system_clock::now();
+  while (token != EOS && token_length < SEQLEN) {
+    std::string pre_word;
+    std::string word;
+    std::vector<int> pre_ids = {pre_token};
+    std::vector<int> ids = {pre_token, token};
+    sentencepiece.Decode(pre_ids, &pre_word);
+    sentencepiece.Decode(ids, &word);
+    std::string diff = word.substr(pre_word.size());
+    history_tokens.emplace_back(token);
+    std::cout << diff << std::flush;
+    if (token_length < SEQLEN) {
+      token_length++;
+    }
+    tok_num++;
+    token = forward_next(token);
+  }
+  auto t2 = std::chrono::system_clock::now();
+  auto use0 = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
+  auto use1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+  printf("\n\nfirst token latency: %f s", (use0.count() * 1e-6));
+  printf("\nspeed: %f token/s\n", tok_num / (use1.count() * 1e-6));
+  if (token_length >= SEQLEN) {
+    history_tokens.clear();
+    round = 0;
+  } else {
+    round++;
+  }
+}
+
+// Append to ret the pieces of s cut at any character of delim; a trailing empty piece is dropped.
+static void split(const std::string &s, const std::string &delim,
+                  std::vector<std::string> &ret) {
+  size_t begin = 0;
+  for (size_t cut = s.find_first_of(delim, begin); cut != std::string::npos;
+       cut = s.find_first_of(delim, begin)) {
+    ret.emplace_back(s, begin, cut - begin);
+    begin = cut + 1;
+  }
+  if (begin < s.length()) {
+    ret.emplace_back(s, begin);
+  }
+}
+
+// Turn a comma-separated list such as "1,2" into device ids via std::atoi.
+static std::vector<int> parseCascadeDevices(const std::string &str) {
+  std::vector<std::string> fields;
+  split(str, ",", fields);
+  std::vector<int> devices(fields.size());
+  std::transform(fields.begin(), fields.end(), devices.begin(),
+                 [](const std::string &f) { return std::atoi(f.c_str()); });
+  return devices;
+}
+
+// Print the command-line help text to stdout.
+void Usage() {
+  fputs("Usage:\n"
+        "  --help         : Show help info.\n"
+        "  --model        : Set model path \n"
+        "  --tokenizer    : Set tokenizer path \n"
+        "  --devid        : Set devices to run for model, e.g. 1,2. if not set, use 0\n", stdout);
+}
+
+// Parse --model/-m, --tokenizer/-t, --devid/-d and --help/-h into the output
+// parameters; prints the usage text and exits on --help or an unknown option.
+void processArguments(int argc, char *argv[], std::string &model_path, std::string &tokenizer_path,
+                      std::vector<int> &devices) {
+  struct option longOptions[] = {{"model", required_argument, nullptr, 'm'},
+                                 {"tokenizer", required_argument, nullptr, 't'},
+                                 {"devid", required_argument, nullptr, 'd'},
+                                 {"help", no_argument, nullptr, 'h'},
+                                 {nullptr, 0, nullptr, 0}};
+
+  int optionIndex = 0;
+  int option;
+
+  while ((option = getopt_long(argc, argv, "m:t:d:h", longOptions,
+                               &optionIndex)) != -1) { // "h" without ':' so -h takes no argument, like --help
+    switch (option) {
+    case 'm':
+      model_path = optarg;
+      break;
+    case 't':
+      tokenizer_path = optarg;
+      break;
+    case 'd':
+      devices = parseCascadeDevices(optarg);
+      break;
+    case 'h':
+    case '?':
+      Usage();
+      exit(EXIT_FAILURE);
+    default:
+      exit(EXIT_FAILURE);
+    }
+  }
+}
+
+// Entry point: parse the command line, bring the model up, chat, tear down.
+int main(int argc, char **argv) {
+  printf("Demo for ChatGLM in BM1684X, support ChatGLM1/2/3\n");
+  std::vector<int> devices{0}; // default device when --devid is absent
+  std::string model_path;
+  std::string tokenizer_path;
+  processArguments(argc, argv, model_path, tokenizer_path, devices);
+  if (model_path.empty()) { // the bmodel path is mandatory
+    Usage();
+    exit(EXIT_FAILURE);
+  }
+
+  ChatGLM session;
+  printf("Init Environment ...\n");
+  session.init(devices, model_path, tokenizer_path);
+  printf("==========================\n");
+  session.chat();
+  session.deinit();
+  return 0;
+}
diff --git a/models/ChatGLM3/support/include/bmdef.h b/models/ChatGLM3/support/include/bmdef.h
index 68f251e..d41a4b0 100755
--- a/models/ChatGLM3/support/include/bmdef.h
+++ b/models/ChatGLM3/support/include/bmdef.h
@@ -98,7 +98,9 @@ typedef struct bm_net_info_s {
   size_t* max_input_bytes;       /* max_input_bytes[0]/ [1] / ... / [input_num-1] */
   size_t* max_output_bytes;      /* max_output_bytes[0] / [1] / ... / [output_num-1] */
   int* input_zero_point;         /* input_zero_point[0] / [1] / .../ [input_num-1] */
-  int* output_zero_point;        /* output_zero_point[0] / [1] / .../ [input_num-1] */
+  int* output_zero_point;        /* output_zero_point[0] / [1] / .../ [output_num-1] */
+  int *input_loc_devices;         /* input_loc_device[0] / [1] / .../ [input_num-1] */
+  int *output_loc_devices;        /* output_loc_device[0] / [1] / .../ [output_num-1] */
 } bm_net_info_t;
 
 typedef struct api_info_s {
diff --git a/models/ChatGLM3/support/include/bmlib_runtime.h b/models/ChatGLM3/support/include/bmlib_runtime.h
index 60094e1..071cfe0 100755
--- a/models/ChatGLM3/support/include/bmlib_runtime.h
+++ b/models/ChatGLM3/support/include/bmlib_runtime.h
@@ -1628,6 +1628,7 @@ typedef struct bm_profile {
   unsigned long cdma_out_time;
   unsigned long cdma_out_counter;
   unsigned long tpu_process_time;
+  unsigned long tpu1_process_time;
   unsigned long sent_api_counter;
   unsigned long completed_api_counter;
 #else
@@ -1636,6 +1637,7 @@ typedef struct bm_profile {
   unsigned long long cdma_out_time;
   unsigned long long cdma_out_counter;
   unsigned long long tpu_process_time;
+  unsigned long long tpu1_process_time;
   unsigned long long sent_api_counter;
   unsigned long long completed_api_counter;
 #endif
diff --git a/models/ChatGLM3/support/include/bmruntime_interface.h b/models/ChatGLM3/support/include/bmruntime_interface.h
index 54fd90d..cbf6964 100755
--- a/models/ChatGLM3/support/include/bmruntime_interface.h
+++ b/models/ChatGLM3/support/include/bmruntime_interface.h
@@ -59,6 +59,12 @@ it will alloc device mem to tensor->device_mem, so user should bmrt_free_device(
 tensor->device_mem) to free it.*/
 DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape);
 
+/*
+fill a tensor with data type and shape, and st_mode = 0 as default.
+tensor and p_bmrt should not be NULL, shape count should not be 0.
+it will alloc device mem to tensor->device_mem on devid-th device.*/
+DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape);
+
 /* fill a tensor with device mem existed, tensor byte size should not large than device mem size */
 DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem,
                              bm_data_type_t dtype, bm_shape_t shape);
@@ -345,6 +351,52 @@ DECL_EXPORT bool bmrt_launch_tensor_multi_cores(
     const int *core_list,
     int core_num);
 
+/**
+ *  @name    bmrt_memcpy_s2d_parallel
+ *  @brief   To copy data from system memory to multi-device memory in parallel
+ *  @ingroup bmruntime
+ *
+ *  This API only could be used when the p_bmrt is created with bmrt_create_ex on multi devices.
+ *  After calling this API, datas[:tensor_num[0]] will be copied to the first device, and
+ *  datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device and so on.
+ *  The process of copying data to different devices is done in parallel and to the same device is in sequence.
+ * 
+ *  @param [in]     p_bmrt      Bmruntime that had been created with multi bm_handles
+ *  @param [in]     tensors     Array of tensors that will be copied to devices
+ *  @param [in]     datas       Array of data buffers allocated in system memory
+ *  @param [in]     tensor_num  Array of tensor_num that will be copied to each device
+ *  @param [in]     device_num  Device number
+*/
+DECL_EXPORT bool bmrt_memcpy_s2d_parallel(
+    void *p_bmrt,
+    bm_tensor_t tensors[],
+    void *datas[],
+    int tensor_num[],
+    int device_num);
+
+/**
+ *  @name    bmrt_memcpy_d2s_parallel
+ *  @brief   To copy data from multi-device memory to system memory in parallel
+ *  @ingroup bmruntime
+ *
+ *  This API only could be used when the p_bmrt is created with bmrt_create_ex on multi devices.
+ *  After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]] , and
+ *  tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] and so on.
+ *  The process of copying data from different devices is done in parallel and from the same device is in sequence.
+ * 
+ *  @param [in]     p_bmrt      Bmruntime that had been created with multi bm_handles
+ *  @param [in]     datas       Array of data buffers allocated in system memory
+ *  @param [in]     tensors     Array of tensors that will be copied from devices
+ *  @param [in]     tensor_num  Array of tensor_num that will be copied from each device
+ *  @param [in]     device_num  Device number
+*/
+DECL_EXPORT bool bmrt_memcpy_d2s_parallel(
+    void *p_bmrt,
+    void *datas[],
+    bm_tensor_t tensors[],
+    int tensor_num[],
+    int device_num);
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/models/Llama2/demo/demo.cpp b/models/Llama2/demo/demo.cpp
index 1f193c4..bc98465 100755
--- a/models/Llama2/demo/demo.cpp
+++ b/models/Llama2/demo/demo.cpp
@@ -73,7 +73,7 @@ void LLama2::load_sentencepiece(std::string tokenizer_path) {
   printf("Done!\n");
 }
 
-void LLama2::init(const std::vector<int> &devices, std::string model, std::string tokenizer_path) {
+void LLama2::init(const std::vector<int> &devices, std::string model_path, std::string tokenizer_path) {
   // load tokenizer
   load_sentencepiece(tokenizer_path);
 
@@ -97,8 +97,8 @@ void LLama2::init(const std::vector<int> &devices, std::string model, std::strin
   assert(NULL != p_bmrt);
 
   // load bmodel by file
-  printf("Model[%s] loading ....\n", model.c_str());
-  bool ret = bmrt_load_bmodel(p_bmrt, model.c_str());
+  printf("Model[%s] loading ....\n", model_path.c_str());
+  bool ret = bmrt_load_bmodel(p_bmrt, model_path.c_str());
   assert(true == ret);
   printf("Done!\n");
 
diff --git a/models/Qwen/demo/demo_parallel.cpp b/models/Qwen/demo/demo_parallel.cpp
index 3e405be..5ddd3e0 100755
--- a/models/Qwen/demo/demo_parallel.cpp
+++ b/models/Qwen/demo/demo_parallel.cpp
@@ -18,6 +18,8 @@
 #include "bmruntime_interface.h"
 #include <getopt.h>
 
+static const uint16_t BF16_NEG_10000 = 0xC61C; // -9984 by bfloat16
+
 // #define EXPORT_RESULTS
 #ifdef EXPORT_RESULTS
 #include "cnpy.h"
@@ -58,9 +60,6 @@ void dump_tensor(bm_handle_t bm_handle, bm_tensor_t &tensor) {
   ptr[0] = ptr[0];
 }
 
-
-static const uint16_t BF16_NEG_10000 = 0xC61C; // -9984 by bfloat16
-
 static const std::string TOKENIZER_MODEL = "qwen.tiktoken";
 
 class QwenChat {
@@ -81,11 +80,11 @@ class QwenChat {
   std::vector<bm_handle_t> handles;
   bm_handle_t bm_handle;
   void *p_bmrt;
-  std::vector<const bm_net_info_t *> net_blocks;
-  std::vector<const bm_net_info_t *> net_blocks_cache;
   const bm_net_info_t *net_embed;
   const bm_net_info_t *net_embed_cache;
   const bm_net_info_t *net_lm;
+  std::vector<const bm_net_info_t *> net_blocks;
+  std::vector<const bm_net_info_t *> net_blocks_cache;
   std::vector<bm_tensor_t> inputs_embed_512, outputs_embed_512;
   std::vector<bm_tensor_t> inputs_pid, next_pid, inputs_attention, next_attention;
   std::vector<std::vector<bm_tensor_t>> past_keys, past_values;
@@ -142,8 +141,8 @@ void QwenChat::init(const std::vector<int> &devices, std::string model) {
 
 
   // embed, lm_head
-  name_embed = "embedding_1";
-  name_embed_cache = "embedding_0";
+  name_embed = "embedding";
+  name_embed_cache = "embedding_cache";
   name_lm = "lm_head";
   net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
   net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str());
@@ -287,26 +286,6 @@ void QwenChat::deinit() {
   }
 }
 
-// after first block, move real result to end of mem
-void QwenChat::move2end(const bm_tensor_t &kv) {
-  if (token_length >= SEQLEN) {
-    return;
-  }
-  auto total_size = bm_mem_get_device_size(kv.device_mem);
-  auto bytes = total_size / SEQLEN;
-  auto real_size = token_length * bytes;
-  auto mem =
-      bm_mem_from_device(bm_mem_get_device_addr(kv.device_mem), real_size);
-  auto buffer = new uint8_t[real_size];
-  auto dst = new uint8_t[total_size];
-  bm_memcpy_d2s(bm_handle, (void *)buffer, mem);
-  memset(dst, 0, total_size - real_size);
-  memcpy(dst + total_size - real_size, buffer, real_size);
-  bm_memcpy_s2d(bm_handle, kv.device_mem, (void *)dst);
-  delete[] buffer;
-  delete[] dst;
-}
-
 int QwenChat::forward_first(std::vector<int> &tokens) {
   std::vector<int> input_ids(SEQLEN, 0);
   std::vector<int> position_id(SEQLEN, 0);