diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.gitmodules b/.gitmodules old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/models/Baichuan2/README.md b/models/Baichuan2/README.md new file mode 100644 index 0000000..635ccb5 --- /dev/null +++ b/models/Baichuan2/README.md @@ -0,0 +1,182 @@ +![image](./assets/sophgo_chip.png) + +# Baichuan2-TPU + +本项目实现BM1684X部署语言大模型[Baichuan2-7B](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)。通过[TPU-MLIR](https://github.com/sophgo/tpu-mlir)编译器将模型转换成bmodel,并采用c++代码将其部署到BM1684X的PCIE环境,或者SoC环境。 + +下文中默认是PCIE环境;如果是SoC环境,按提示操作即可。 + +# 目录说明 +``` +. +├── README.md #使用说明 +├── requirements.txt #需要使用的python wheel包 +├── assets +├── compile +│   ├── compile.sh #用来编译TPU模型的脚本 +│   ├── export_onnx_fast.py #用来导出onnx的脚本 +│   ├── modeling_baichuan.py #替换Baichuan2-7B-chat的对应文件的备份 +│   └── torch_inference.py #torch推理脚本 +├── demo #Baichuan2 c++代码文件 +│   ├── CMakeLists.txt +│   └── demo.cpp #主程序 +├── src #编译依赖库 +│   ├── include +│   ├── lib_pcie +│   └── lib_soc +├── model #模型文件(bmodel需下载) +│   ├── baichuan2-7b-test_int8.bmodel +│   └── tokenizer.model +└── web_demo #web demo,提供网页对话示例 + ├── chat.cpp + ├── chat.py + ├── CMakeLists.txt + └── web_demo.py +``` +---------------------------- + +# 【阶段一】模型编译 + +## 注意点 +* 模型编译必须要在docker内完成,无法在docker外操作 + +### 步骤一:模型下载 +Baichuan2模型在hugging face上完全开源,供用户下载使用。请根据官网下载步骤进行模型与权重的下载。 +```bash +# Make sure you have git-lfs installed (https://git-lfs.com) +git lfs install +git clone https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat + +# if you want to clone without large files – just their pointers +# prepend your git clone with the following env var: +GIT_LFS_SKIP_SMUDGE=1 +``` + +### 步骤二:下载docker + +下载docker,启动容器,如下: + +``` shell +docker pull sophgo/tpuc_dev:latest + +# myname1234 is just an example, you can set your own name +docker run --privileged --name myname1234 -v $PWD:/workspace -it sophgo/tpuc_dev:latest 
+``` + +### 步骤三:下载TPU-MLIR代码并编译 + +``` shell +git clone git@github.com:sophgo/tpu-mlir.git +cd tpu-mlir +source ./envsetup.sh +./build.sh +``` +* PS:重新进入docker环境并且需要编译模型时,必须在此路径下执行上述`source ./envsetup.sh` 和 `./build.sh`才能完成后续模型编译。 + +### 步骤四:下载本项目,安装requirements.txt +下载transformers、sentencepiece、Baichuan2-TPU以及百度网盘里的.bin模型,并替换transformers里面的modeling_baichuan.py + +``` shell +git clone https://github.com/sophgo/Baichuan2-TPU.git +cd Baichuan2-TPU +pip install -r requirements.txt +``` + +### 步骤五:替换modeling_baichuan.py, 修改config.json, 生成onnx文件 +修改Baichuan2-7B-chat项目中config.json文件中max_position_embeddings与model_max_length,从4096变为512 + +``` shell +cd compile +cp modeling_baichuan.py $BAICHUAN2_PATH +python export_onnx_fast.py --model_path your_model_path +``` + +* PS1:your_model_path 指的是原模型下载后的地址, 如:"../../torch2onnx/Baichuan2-7B-Chat", 可以根据需要选择使用7b模型还是13b模型。 +* PS2:如果你想要debug,而不是一下子生成完成全部的onnx模型,可以将240行的num_layers改成1, 并结合`test_net_with_mask`函数对比单个block情况下的输出是否可以和原模型对齐。 + +### 步骤六:生成bmodel文件 + +生成模型 + +``` shell +./compile.sh --mode int8 +``` + +* PS1:编译完成后最终会在Baichuan2-TPU/compile路径下生成名为baichuan2-{X}b_{Y}_{Z}dev.bmodel,其中X为7或13,Y为`compile.sh`时选择的`mode`的数据类型,Z为推理的芯片数量(如果不指定num_device, Z默认为1) +* PS2:生成bmodel耗时大概3小时以上,建议64G内存以及200G以上硬盘空间,不然很可能OOM或者no space left +* PS3:目前给定的lib_pcie和lib_soc部分仅包含单芯的动态库,多芯部分会在后续更新 + +---------------------------- + +# 【阶段二】可执行文件生成(可以跳过) + +## 准备 +* bmodel模型准备:经过阶段一后将得到编译好的bmodel文件【也可以使用我们提供的现成编译好的bmodel文件】,下载方式为: +```shell +cd Baichuan2-TPU/model +pip3 install dfss +# baichuan2-7B +python3 -m dfss --url=open@sophgo.com:sophon-demo/baichuan2/baichuan2-7b-test_int8.bmodel +``` +将得到编译好的int8单芯bmodel模型文件。 + +## 编译程序(C++版本) + +执行如下编译,默认是PCIE版本: + +```shell +cd Baichuan2-TPU/demo +mkdir build +cd build +cmake ..
+make +``` + +如果是SoC版本,有两种编译方法: + +方法1:直接将demo目录拷贝到SoC环境,按以上步骤编译(推荐) + +方法2:docker中交叉编译,如下操作 + +```shell +wget https://releases.linaro.org/components/toolchain/binaries/7.5-2019.12/aarch64-linux-gnu/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz +tar -xvf gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz +mv gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu /opt/aarch64-linux-gnu-7.5.0 +cd Baichuan2-TPU/demo +mkdir build +cd build +cmake .. -DTARGET_ARCH=soc # soc 只有一颗芯片,因此不支持多芯编译 +make -j +``` + +编译生成baichuan2可执行程序。 + +运行`baichuan2`: +```shell +./baichuan2 --model ../model/baichuan2-7b-test_int8.bmodel --dev dev_id +``` + +## 编译程序(Python Web版本)【单芯】 + +```shell +pip install gradio==3.39.0 +cd Baichuan2-TPU/web_demo +mkdir build +cd build +cmake .. +make -j +``` + +编译成功会在`build`文件夹下生成`libtpuchat.so*`, 此时可以在web_demo.py中指定bmodel\_path token\_path device\_id, lib_path(编译生成的`libtpuchat.so*`文件, 默认路径是`./build`下), 以及dev_id。 +```shell +python web_demo.py +``` +即可成功运行web的demo。 +* PS:在用户不修改上述token\_path和lib\_path的存放路径前提下只需指定bmodel\_path即可运行程序。 + +如果是SoC环境,参考C++版本 + +* PS:尽量下载gradio==3.39.0版本,不然会出现各种问题!!
+
+# 常见问题
+* 请根据实际block数目调整`demo/chat`中或者`web_demo/chat.cpp`中的NUM_LAYERS,默认是使用Baichuan2-7B(NUM_LAYERS=32)
\ No newline at end of file
diff --git a/models/Baichuan2/compile/compile.sh b/models/Baichuan2/compile/compile.sh
new file mode 100755
index 0000000..c71c28a
--- /dev/null
+++ b/models/Baichuan2/compile/compile.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+# Compile the Baichuan2 ONNX graphs exported by export_onnx_fast.py (embedding,
+# lm_head and every transformer block) to bmodels and combine them into one file.
+#
+# Usage: ./compile.sh [--mode f16|int8|int4] [--num_device N] [--name baichuan2-7b|baichuan2-13b]
+set -ex
+models=
+mode="f16"
+folder="tmp"
+num_device=1
+mode_args=""
+device_args=""
+quantize_args="--quantize F16"
+# Default model; override with --name.
+name="baichuan2-7b"
+num_layers=
+out_model=
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+
+    case $key in
+    --mode)
+        mode="$2"
+        shift 2
+        ;;
+    --num_device)
+        num_device="$2"
+        shift 2
+        ;;
+    --name)
+        name="$2"
+        shift 2
+        ;;
+    *)
+        echo "Invalid option: $key" >&2
+        exit 1
+        ;;
+    esac
+done
+
+# Any mode other than int8/int4 keeps the default F16 quantization.
+if [ x$mode == x"int8" ]; then
+    quantize_args="--quantize W8F16"
+elif [ x$mode == x"int4" ]; then
+    quantize_args="--quantize W4BF16 --q_group_size 64"
+fi
+
+# The block count depends on the model size. This runs after argument parsing
+# so that --name is honoured, and rejects unknown names up front: an empty
+# num_layers would otherwise only fail in the block loop, after the embedding
+# and lm_head have already been compiled.
+if [ x$name == x"baichuan2-7b" ]; then
+    num_layers=32
+    echo "Compile Baichuan2-7B"
+elif [ x$name == x"baichuan2-13b" ]; then
+    num_layers=40
+    echo "Compile Baichuan2-13B"
+else
+    echo "Unsupported model name: $name (expected baichuan2-7b or baichuan2-13b)" >&2
+    exit 1
+fi
+
+if [ x$num_device != x1 ]; then
+    device_args="--num_device $num_device"
+fi
+# The output name always carries the device count, e.g. baichuan2-7b_int8_1dev.bmodel.
+out_model=$name'_'$mode'_'$num_device'dev.bmodel'
+
+# ---- embedding: one bmodel for a $seqlen-token input, one for a single token ----
+outdir=${folder}/embedding
+mkdir -p $outdir
+pushd $outdir
+
+# NOTE(review): presumably has to match --max_length of export_onnx_fast.py (default 512) -- confirm.
+seqlen=512
+model_transform.py \
+    --model_name embedding \
+    --model_def ../embedding.onnx \
+    --input_shapes [[$seqlen]] \
+    --mlir embedding_${seqlen}.mlir
+
+
+model_deploy.py \
+    --mlir embedding_$seqlen.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    $device_args \
+    --model embedding_${seqlen}_f16.bmodel
+
+model_transform.py \
+    --model_name embedding_cache \
+    --model_def ../embedding.onnx \
+    --input_shapes [[1]] \
+    --mlir embedding_1.mlir
+
+
+model_deploy.py \
+    --mlir embedding_1.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    $device_args \
+    --model embedding_1_f16.bmodel
+
+# -f: under `set -e` a glob that matches nothing must not abort the whole run.
+rm -f *.npz
+
+models=$models' '$outdir'/embedding_1_f16.bmodel '$outdir'/embedding_'$seqlen'_f16.bmodel '
+
+popd
+
+echo $models
+
+# ---- lm_head ----
+outdir=${folder}/$mode"_"$num_device"dev"/lm_head
+mkdir -p $outdir
+pushd $outdir
+
+model_transform.py \
+    --model_name lm_head \
+    --model_def ../../lm_head.onnx \
+    --mlir lm_head.mlir
+
+
+model_deploy.py \
+    --mlir lm_head.mlir \
+    --quantize F16 \
+    --chip bm1684x \
+    --model lm_head.bmodel
+
+rm -f *.npz
+
+models=${models}${outdir}'/lm_head.bmodel '
+popd
+
+echo $models
+
+# ---- transformer blocks: block_$i and block_cache_$i for every layer ----
+outdir=${folder}/$mode"_"$num_device"dev"/block
+mkdir -p $outdir
+
+pushd $outdir
+
+for ((i=0; i<$num_layers; i++))
+do
+
+model_transform.py \
+    --model_name block_$i \
+    --model_def ../../block_$i.onnx \
+    --mlir block_$i.mlir
+
+model_deploy.py \
+    --mlir block_$i.mlir \
+    $quantize_args \
+    --chip bm1684x \
+    --quant_output \
+    --quant_output_list 2,3 \
+    $device_args \
+    --model block_$i.bmodel
+
+model_transform.py \
+    --model_name block_cache_$i \
+    --model_def ../../block_cache_${i}.onnx \
+    --mlir block_cache_$i.mlir
+
+model_deploy.py \
+    --mlir block_cache_$i.mlir \
+    $quantize_args \
+    --chip bm1684x \
+    --quant_input \
+    --quant_output \
+    --quant_input_list 4,5 \
+    --quant_output_list 2,3 \
+    $device_args \
+    --model block_cache_$i.bmodel
+
+rm -f *.npz
+# rm ../../block_$i.onnx
+# rm ../../block_cache_$i.onnx
+
+models=${models}${outdir}'/block_'$i'.bmodel '$outdir'/block_cache_'$i'.bmodel '
+
+done
+popd
+echo $models
+
+# Merge every per-stage bmodel into the final deployable model.
+model_tool --combine $models -o $out_model
diff --git a/models/Baichuan2/compile/export_onnx_fast.py b/models/Baichuan2/compile/export_onnx_fast.py
new file mode 100755
index 0000000..dbab131
--- /dev/null
+++ b/models/Baichuan2/compile/export_onnx_fast.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+#
============================================================================== +# +# Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved. +# +# TPU-MLIR is licensed under the 2-Clause BSD License except for the +# third-party components. +# +# ============================================================================== + +import os +import datetime +import math +import unittest +import torch +import random +import sys +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation.utils import GenerationConfig +import numpy as np +import argparse + +# folder = "./tmp" +# model_path = "/home/junqian/workspace/llm/baichuan2-7B/Baichuan2-7B-Chat" +parser = argparse.ArgumentParser(description='export Baichuan2 onnx.') +parser.add_argument('--model_path', type=str, default="../baichuan2-7B/Baichuan2-7B-Chat/", help='path to the torch model.') +parser.add_argument('--max_length', type=int, default=512, help="max sequence length") + +args = parser.parse_args() + +model_path = args.model_path +MAX_LEN = args.max_length +folder = "./tmp" + +origin_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) +origin_model.generation_config = GenerationConfig.from_pretrained(model_path) +origin_model.eval() +transformer = origin_model.model +config = origin_model.config + + +for param in origin_model.parameters(): + param.requires_grad = False + +num_layers = config.num_hidden_layers +hidden_size = config.hidden_size +num_attention_heads = config.num_attention_heads +head_dim = hidden_size // num_attention_heads +layers = transformer.layers +tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) + + +def set_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + random.seed(seed) + +class Embedding(torch.nn.Module): + def __init__(self): + 
super().__init__() + + def forward(self, input_ids): + return transformer.embed_tokens(input_ids) + + +class Block(torch.nn.Module): + + def __init__(self, layer_id): + super().__init__() + # params + self.layer_id = layer_id + self.layer = layers[layer_id] + + def forward(self, hidden_states, position_ids, attention_mask): + hidden_states, past_kv = self.layer(hidden_states, + attention_mask, + position_ids, + use_cache=True) + past_k, past_v = past_kv + return hidden_states, past_k, past_v + + +class BlockCache(torch.nn.Module): + + def __init__(self, layer_id): + super().__init__() + # params + self.layer_id = layer_id + self.layer = layers[layer_id] + + def forward(self, hidden_states, position_ids, attention_mask, past_k, + past_v): + hidden_states, past_kv = self.layer(hidden_states, + attention_mask, + position_ids=position_ids, + past_key_value=(past_k, past_v), + use_cache=True) + past_k, past_v = past_kv + return hidden_states, past_k, past_v + + +class LmHead(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, hidden_states): + hidden_states = transformer.norm(hidden_states) + m_logits = origin_model.lm_head(hidden_states) + _, token = torch.topk(m_logits, 1) + return token + + +def convert_block(layer_id): + # input + # MAX_LEN + 1 for model combine + hidden_states = torch.randn((1, MAX_LEN, hidden_size)) + position_ids = torch.tensor([range(MAX_LEN)], dtype=torch.long) + attention_mask = -1000 * torch.ones((1, 1, MAX_LEN, MAX_LEN), dtype=torch.float32).triu(diagonal=1) + model = Block(layer_id) + # hiddeng_states = model(input_ids, position_ids) + + torch.onnx.export( + model, (hidden_states, position_ids, attention_mask), + f'./tmp/block_{layer_id}.onnx', + verbose=False, + input_names=['input_states', 'position_ids', 'attention_mask'], + output_names=['hidden_states', 'past_k', 'past_v'], + do_constant_folding=True, + opset_version=15) + + +def convert_block_cache(layer_id): + # input + np.random.seed(42) + 
hidden_states = torch.randn((1, 1, hidden_size)) + position_ids = torch.tensor([range(1)], dtype=torch.long) + attention_mask = -1000 * torch.ones((1, 1, 1, MAX_LEN + 1), dtype=torch.float32).triu(diagonal=0) + past_k = torch.randn((1, MAX_LEN, num_attention_heads, head_dim)) + past_v = torch.randn((1, MAX_LEN, num_attention_heads, head_dim)) + model = BlockCache(layer_id) + # hiddeng_states = model(input_ids, position_ids) + + torch.onnx.export( + model, (hidden_states, position_ids, attention_mask, past_k, past_v), + f'./tmp/block_cache_{layer_id}.onnx', + verbose=False, + input_names=[ + 'input_states', 'position_ids', 'attention_mask', 'history_k', + 'history_v' + ], + output_names=['hidden_states', 'past_k', 'past_v'], + do_constant_folding=True, + opset_version=15) + + +def convert_embedding(): + model = Embedding() + torch.onnx.export(model, (torch.tensor([0, 1, 2, 3])), + f'./tmp/embedding.onnx', + verbose=False, + input_names=['input_ids'], + output_names=['input_embed'], + dynamic_axes={"input_ids": { + 0: "length" + }}, + do_constant_folding=True, + opset_version=15) + + +def convert_lm_head(): + model = LmHead() + input = torch.randn(1, hidden_size) + torch.onnx.export(model, (input), + f'./tmp/lm_head.onnx', + verbose=False, + input_names=['hidden_states'], + output_names=['token'], + do_constant_folding=True, + opset_version=15) + + +def test_net_with_mask(): + embed = Embedding() + blocks = [Block(i) for i in range(num_layers)] + block_kvs = [BlockCache(i) for i in range(num_layers)] + ids = tokenizer.encode('解释一下“温故而知新”这句话的意思。') + print("input ids:{}".format(ids)) + token_len = len(ids) + ids = ids + (MAX_LEN - token_len) * [0] + input_ids = torch.tensor(ids).view(MAX_LEN) + out = embed(input_ids).view(1, MAX_LEN, hidden_size) + position_ids = list(range(token_len)) + (MAX_LEN - token_len) * [0] + position_ids = torch.tensor([position_ids]) + attention_mask = -1000 * torch.ones((MAX_LEN, MAX_LEN)) + for i in range(token_len): + for j in 
range(token_len): + if j <= i: + attention_mask[i][j] = 0 + attention_mask = attention_mask.view(1, 1, MAX_LEN, MAX_LEN) + k_cache = [] + v_cache = [] + for i in range(num_layers): + out, k, v = blocks[i](out, position_ids, attention_mask) + k[:,MAX_LEN - token_len:] = k[:,:token_len] + v[:,MAX_LEN - token_len:] = v[:,:token_len] + k[:,:MAX_LEN - token_len] = 0 + v[:,:MAX_LEN - token_len] = 0 + k_cache.append(k) + v_cache.append(v) + out = out[:,token_len - 1:token_len].view(1, hidden_size) + lm = LmHead() + token = lm(out).view(1) + out_ids = [int(token)] + word = tokenizer._convert_id_to_token(int(token[0])) + print(word, end="") + while token > 2 and token_len < 64: + token_len += 1 + input_ids = torch.tensor([token]) + out = embed(input_ids).view(1, 1, hidden_size) + position_ids = torch.tensor([[token_len - 1]]) + attention_mask = -1000 * torch.ones((1, 1, 1, MAX_LEN + 1)) + attention_mask[:, :, :, MAX_LEN + 1 - token_len:] = 0 + for i in range(num_layers): + out, present_k_cache, present_v_cache = block_kvs[i](out, position_ids, + attention_mask, + k_cache[i], v_cache[i]) + new_k = torch.zeros(k_cache[i].shape) + new_v = torch.zeros(v_cache[i].shape) + new_k[:,MAX_LEN - token_len:MAX_LEN - 1] = k_cache[i][:,MAX_LEN - token_len + 1:] + new_v[:,MAX_LEN - token_len:MAX_LEN - 1] = v_cache[i][:,MAX_LEN - token_len + 1:] + new_k[:,MAX_LEN - 1:] = present_k_cache + new_v[:,MAX_LEN - 1:] = present_v_cache + k_cache[i] = new_k + v_cache[i] = new_v + token = lm(out).view(1) + out_ids.append(int(token)) + word = tokenizer._convert_id_to_token(int(token[0])) + print(word, end="") + print("\noutput_ids:{}".format(out_ids)) + +set_seed(42) +# test_net_with_mask() + +# create folder to store onnx +if not os.path.exists(folder): + os.makedirs(folder) + + +# export models +for i in range(num_layers): + print("convert_block_{}".format(i)) + convert_block_cache(i) + convert_block(i) +print("convert_embedding") +convert_embedding() +print("convert_lmhead") +convert_lm_head() 
diff --git a/models/Baichuan2/compile/modeling_baichuan.py b/models/Baichuan2/compile/modeling_baichuan.py new file mode 100644 index 0000000..5046dfc --- /dev/null +++ b/models/Baichuan2/compile/modeling_baichuan.py @@ -0,0 +1,792 @@ +# Copyright 2023 Baichuan Inc. All Rights Reserved. + +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from .configuration_baichuan import BaichuanConfig +from .generation_utils import build_chat_input, TextIterStreamer + +import math +from typing import List, Optional, Tuple, Union +from threading import Thread + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import functional as F +from transformers import PreTrainedModel, PretrainedConfig +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.generation.utils import GenerationConfig +from transformers.utils import logging, ContextManagers + +import os +from contextlib import contextmanager +logger = logging.get_logger(__name__) + +try: + from xformers import ops as xops +except ImportError: + xops = None + logger.warning( + "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers." + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + if len(mask.size()) == 3: + bsz, src_len, _ = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype) + else: + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + 
super().__init__() + self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32) + self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32) + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device) + self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device) + elif self.cos_cached.device != x.device: + self.cos_cached = self.cos_cached.to(x.device) + self.sin_cached = self.sin_cached.to(x.device) + return ( + self.cos_cached[:, :, :seq_len, ...], + self.sin_cached[:, :, :seq_len, ...], + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): + cos = cos_.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin_.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + cos = cos.transpose(1, 2) + sin = sin.transpose(1, 2) + q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) + k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) + return 
q_embed.to(q.dtype), k_embed.to(k.dtype) + + +class MLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + def __init__(self, config: BaichuanConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim) + + kv_seq_len = key_states.shape[-3] + if past_key_value is not None: + kv_seq_len = kv_seq_len + past_key_value[0].shape[-3] + if past_key_value is not None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len-1) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + past_kv = (key_states, value_states) if use_cache else None + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=1) + value_states = torch.cat([past_key_value[1], value_states], dim=1) + + + if xops is not None and self.training: + attn_weights = None + query_states = query_states.transpose(1, 2) + key_states = 
key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + attn_output = xops.memory_efficient_attention( + query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() + ) + else: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_kv + + +class DecoderLayer(nn.Module): + def __init__(self, config: BaichuanConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Attention(config=config) + self.mlp = MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + 
output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BaichuanPreTrainedModel(PreTrainedModel): + config_class = BaichuanConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DecoderLayer"] + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BaichuanModel): + module.gradient_checkpointing = value + + +class BaichuanModel(BaichuanPreTrainedModel): + def __init__(self, config: BaichuanConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from 
transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + 
elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class NormHead(nn.Module): + def __init__(self, hidden_size, vocab_size, bias=False): + super().__init__() + self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size))) + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + self.first_flag = True + + def 
forward(self, hidden_states): + if self.training: + norm_weight = nn.functional.normalize(self.weight) + self.first_flag = True + elif self.first_flag: + self.first_flag = False + self.weight.data = nn.functional.normalize(self.weight) + norm_weight = self.weight + else: + norm_weight = self.weight + return nn.functional.linear(hidden_states, norm_weight) + +_init_weights = True +@contextmanager +def no_init_weights(_enable=True): + global _init_weights + old_init_weights = _init_weights + if _enable: + _init_weights = False + try: + yield + finally: + _init_weights = old_init_weights + +class BaichuanForCausalLM(BaichuanPreTrainedModel): + def __init__(self, config, *model_args, **model_kwargs): + super().__init__(config, *model_args, **model_kwargs) + self.model = BaichuanModel(config) + + self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False) + if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False): + try: + from .quantizer import quantize_offline, init_model_weight_int4 + except ImportError: + raise ImportError(f"Needs QLinear to run quantize.") + quantize_offline(self, 4) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], + *model_args, + config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + ignore_mismatched_sizes: bool = False, + force_download: bool = 
False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + use_safetensors: bool = None, + **kwargs, + ): + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=False, + proxies=None, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder="", + _from_auto=False, + _from_pipeline=None, + **kwargs, + ) + else: + model_kwargs = kwargs + + if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']: + try: + from .quantizer import init_model_weight_int4 + from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map + from accelerate.utils import CustomDtype + from accelerate.utils import get_balanced_memory + except ImportError: + raise ImportError(f"Needs import model weight init func to run quantize.") + # Instantiate model. 
+ init_contexts = [no_init_weights(_enable=True)] + init_contexts.append(init_empty_weights()) + with ContextManagers(init_contexts): + model = cls(config) + + model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin') + state_dict = torch.load(model_file, map_location="cpu") + model.is_quantized = True + + device_map = kwargs.pop("device_map", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + if device_map is not None: + kwargs = {"no_split_module_classes": model._no_split_modules} + target_dtype = CustomDtype.INT4 + max_memory = get_balanced_memory( + model, + dtype=target_dtype, + low_zero=(device_map == "balanced_low_0"), + max_memory=None, + **kwargs, + ) + kwargs["max_memory"] = max_memory + device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs) + + model = init_model_weight_int4(config, model, state_dict) + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + # If it is a model with generation capabilities, attempt to load the generation config + if model.can_generate(): + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=False, + proxies=None, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder="", + _from_auto=False, + _from_pipeline=None, + **kwargs, + ) + except (OSError, TypeError): + logger.info( + "Generation config file not found, using a generation config created from the model config." 
+ ) + pass + + if device_map is not None: + dispatch_model(model, device_map=device_map) + + return model + return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, + config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, + force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, + use_safetensors=use_safetensors, **kwargs) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + 
shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + softmax_normalizer = shift_logits.max(-1).values ** 2 + z_loss = self.config.z_loss_weight * softmax_normalizer.mean() + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + z_loss + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + def quantize(self, bits: int): + try: + from .quantizer import quantize_online + except ImportError: 
+ raise ImportError(f"Needs QLinear to run quantize.") + return quantize_online(self, bits) + + def chat(self, tokenizer, messages: List[dict], stream=False, + generation_config: Optional[GenerationConfig]=None): + generation_config = generation_config or self.generation_config + input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens) + if stream: + streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + Thread(target=self.generate, kwargs=dict( + inputs=input_ids, streamer=streamer, + generation_config=generation_config, + )).start() + return streamer + else: + outputs = self.generate(input_ids, generation_config=generation_config) + response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True) + return response diff --git a/models/Baichuan2/compile/torch_inference.py b/models/Baichuan2/compile/torch_inference.py new file mode 100644 index 0000000..77c5319 --- /dev/null +++ b/models/Baichuan2/compile/torch_inference.py @@ -0,0 +1,16 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation.utils import GenerationConfig +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('model_path', help='下载模型的绝对路径') +args = parser.parse_args() +model_path = args.model_path +tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float32, trust_remote_code=True) +model.generation_config = GenerationConfig.from_pretrained(model_path) +messages = [] +messages.append({"role": "user", "content": "解释一下“温故而知新”"}) +response = model.chat(tokenizer, messages) +print(response) \ No newline at end of file diff --git a/models/Baichuan2/demo/CMakeLists.txt b/models/Baichuan2/demo/CMakeLists.txt new file mode 100755 index 0000000..5acf3bf --- /dev/null +++ b/models/Baichuan2/demo/CMakeLists.txt @@ -0,0 
+1,38 @@ +cmake_minimum_required(VERSION 2.8) +project(baichuan2) + +if (NOT DEFINED TARGET_ARCH) + set(TARGET_ARCH pcie) +endif() + +set(CMAKE_INSTALL_PREFIX install) + +if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64") + add_definitions(-DSOC_TARGET) + link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc) + message("SoC mode, starting......") +elseif (${TARGET_ARCH} STREQUAL "pcie") + add_definitions(-DPCIE_TARGET) + link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie) + message("Pcie mode, starting......") +elseif (${TARGET_ARCH} STREQUAL "soc") + add_definitions(-DSOC_TARGET) + set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) + set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc) + set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) + link_directories(${PROJECT_SOURCE_DIR}/lib_soc) + message("SoC mode, starting......") +endif() + + + + +include_directories(${PROJECT_SOURCE_DIR}/../src/include) + +add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror) +set(CMAKE_BUILD_TYPE "Debug") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(baichuan2 demo.cpp) +target_link_libraries(baichuan2 bmrt bmlib sentencepiece) + diff --git a/models/Baichuan2/demo/demo.cpp b/models/Baichuan2/demo/demo.cpp new file mode 100755 index 0000000..6956b6a --- /dev/null +++ b/models/Baichuan2/demo/demo.cpp @@ -0,0 +1,472 @@ +//===----------------------------------------------------------------------===// +// +// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved. +// +// TPU-MLIR is licensed under the 2-Clause BSD License except for the +// third-party components. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include "memory.h" +#include "sentencepiece/sentencepiece_processor.h" +#include "bmruntime_interface.h" +#include +#include + +static const int NUM_LAYERS = 32; +static const int MAX_LEN = 512; +static const float ATTENTION_MASK = -1000.; + +static const std::string TOKENIZER_MODEL = "../model/tokenizer.model"; + +// #define EXPORT_RESULTS +#ifdef EXPORT_RESULTS +#include "cnpy.h" +static cnpy::npz_t map; + +template +static void add_array(std::string name, bm_handle_t bm_handle, + const bm_device_mem_t &dst) { + std::vector data(dst.size / sizeof(T)); + bm_memcpy_d2s(bm_handle, data.data(), dst); + cnpy::npz_add_array(map, name, data); +} + +static void save_array(std::string filename) { + cnpy::npz_save_all(filename, map); +} +#endif + +class Baichuan2 { +public: + void init(const std::vector &devid, std::string model); + void chat(); + void deinit(); + +private: + void answer(const std::string &input_str); + int forward_first(std::vector &tokens); + int forward_next(); + void load_sentencepiece(); + +private: + std::vector handles; + bm_handle_t bm_handle; + void *p_bmrt; + sentencepiece::SentencePieceProcessor sentencepiece; + const bm_net_info_t *net_blocks[NUM_LAYERS]; + const bm_net_info_t *net_blocks_cache[NUM_LAYERS]; + const bm_net_info_t *net_embed; + const bm_net_info_t *net_embed_cache; + const bm_net_info_t *net_lm; + bm_tensor_t inputs_embed_512, outputs_embed_512; + bm_tensor_t inputs_lm, outputs_lm; + bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention; + bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS]; + bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS]; + bm_tensor_t present_key_cache, present_value_cache; + std::string name_embed; + std::string name_embed_cache; + std::string name_lm; + std::string name_blocks[NUM_LAYERS]; + std::string 
name_blocks_cache[NUM_LAYERS]; + int round = 0; + int token_length; + int EOS; + std::vector history; +}; + +void Baichuan2::load_sentencepiece() { + printf("Load %s ... ", TOKENIZER_MODEL.c_str()); + auto status = sentencepiece.Load(TOKENIZER_MODEL); + if (!status.ok()) { + std::cout << status.ToString() << std::endl; + exit(-1); + } + EOS = sentencepiece.eos_id(); + printf("Done!\n"); +} + +void Baichuan2::init(const std::vector &devices, std::string model) { + load_sentencepiece(); + // request bm_handle + std::cout << "Device [ "; + for (auto d : devices) { + std::cout << d << " "; + } + std::cout << "] loading ....\n"; + // int device_num = devices.size(); + for (auto d : devices) { + bm_handle_t h; + bm_status_t status = bm_dev_request(&h, d); + assert(BM_SUCCESS == status); + handles.push_back(h); + } + bm_handle = handles[0]; + // create bmruntime + p_bmrt = bmrt_create(bm_handle); + assert(NULL != p_bmrt); + + // load bmodel by file + printf("Model[%s] loading ....\n", model.c_str()); + bool ret = bmrt_load_bmodel(p_bmrt, model.c_str()); + assert(true == ret); + printf("Done!\n"); + // net names + name_embed = "embedding"; + name_embed_cache = "embedding_cache"; + name_lm = "lm_head"; + for (int i = 0; i < NUM_LAYERS; i++) { + name_blocks[i] = "block_" + std::to_string(i); + name_blocks_cache[i] = "block_cache_" + std::to_string(i); + } + + // net infos + net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str()); + net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str()); + net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str()); + for (int i = 0; i < NUM_LAYERS; i++) { + net_blocks[i] = bmrt_get_network_info(p_bmrt, name_blocks[i].c_str()); + net_blocks_cache[i] = + bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str()); + } + + // net device mem + ret = bmrt_tensor(&inputs_embed_512, p_bmrt, net_embed->input_dtypes[0], + net_embed->stages[0].input_shapes[0]); + assert(true == ret); + + ret = bmrt_tensor(&outputs_embed_512, 
p_bmrt, net_embed->output_dtypes[0], + net_embed->stages[0].output_shapes[0]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_pid, p_bmrt, net_blocks[0]->input_dtypes[1], + net_blocks[0]->stages[0].input_shapes[1]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_attention, p_bmrt, net_blocks[0]->input_dtypes[2], + net_blocks[0]->stages[0].input_shapes[2]); + assert(true == ret); + + ret = bmrt_tensor(&next_pid, p_bmrt, net_blocks_cache[0]->input_dtypes[1], + net_blocks_cache[0]->stages[0].input_shapes[1]); + assert(true == ret); + + ret = + bmrt_tensor(&next_attention, p_bmrt, net_blocks_cache[0]->input_dtypes[2], + net_blocks_cache[0]->stages[0].input_shapes[2]); + assert(true == ret); + + for (int i = 0; i < NUM_LAYERS; i++) { + ret = bmrt_tensor(&past_key[i], p_bmrt, net_blocks[0]->output_dtypes[1], + net_blocks[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&past_value[i], p_bmrt, net_blocks[0]->output_dtypes[2], + net_blocks[0]->stages[0].output_shapes[2]); + assert(true == ret); + ret = bmrt_tensor(&present_key[i], p_bmrt, net_blocks[0]->output_dtypes[1], + net_blocks[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&present_value[i], p_bmrt, net_blocks[0]->output_dtypes[2], + net_blocks[0]->stages[0].output_shapes[2]); + assert(true == ret); + } + ret = bmrt_tensor(&present_key_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[1], + net_blocks_cache[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&present_value_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[2], + net_blocks_cache[0]->stages[0].output_shapes[2]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_lm, p_bmrt, net_lm->input_dtypes[0], + net_lm->stages[0].input_shapes[0]); + assert(true == ret); + ret = bmrt_tensor(&outputs_lm, p_bmrt, net_lm->output_dtypes[0], + net_lm->stages[0].output_shapes[0]); + assert(true == ret); +} + +void Baichuan2::deinit() { + bm_free_device(bm_handle, 
inputs_embed_512.device_mem); + bm_free_device(bm_handle, outputs_embed_512.device_mem); + bm_free_device(bm_handle, inputs_lm.device_mem); + bm_free_device(bm_handle, outputs_lm.device_mem); + bm_free_device(bm_handle, inputs_pid.device_mem); + bm_free_device(bm_handle, next_pid.device_mem); + bm_free_device(bm_handle, inputs_attention.device_mem); + bm_free_device(bm_handle, next_attention.device_mem); + bm_free_device(bm_handle, present_key_cache.device_mem); + bm_free_device(bm_handle, present_value_cache.device_mem); + for (int i = 0; i < NUM_LAYERS; i++) { + bm_free_device(bm_handle, past_key[i].device_mem); + bm_free_device(bm_handle, past_value[i].device_mem); + bm_free_device(bm_handle, present_key[i].device_mem); + bm_free_device(bm_handle, present_value[i].device_mem); + } + bmrt_destroy(p_bmrt); + for (auto h : handles) { + bm_dev_free(h); + } +} + +int Baichuan2::forward_first(std::vector &tokens) { + int input_ids[MAX_LEN] = {0}; // start token + int position_id[MAX_LEN] = {0}; + float attention_mask[MAX_LEN * MAX_LEN] = {0}; + token_length = tokens.size(); + + std::copy(tokens.begin(), tokens.end(), input_ids); + for (int i = 0; i < token_length; i++) { + position_id[i] = i; + } + + for (int i = 0; i < MAX_LEN; i++) { + for (int j = 0; j < MAX_LEN; j++) { + if (j <= i && i < token_length) { + } else { + attention_mask[i * MAX_LEN + j] = ATTENTION_MASK; + } + } + } + + // forward embeding + bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem, (void *)input_ids); + auto ret = + bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1, + &outputs_embed_512, 1, true, false); + assert(ret); + // float test_embed[MAX_LEN] = {0}; + // bm_memcpy_d2s(bm_handle, (void *)&test_embed, outputs_embed_512.device_mem); + bm_thread_sync(bm_handle); + + // forward blocks + bm_memcpy_s2d(bm_handle, inputs_pid.device_mem, (void *)position_id); + bm_memcpy_s2d(bm_handle, inputs_attention.device_mem, (void *)attention_mask); + auto inputs_embed = 
outputs_embed_512; + inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0]; + bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention}; + for (int i = 0; i < NUM_LAYERS; i++) { + bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]}; + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3, + outputs_block, 3, true, false); + assert(ret); + bm_thread_sync(bm_handle); + } + int bytes = inputs_embed.device_mem.size / MAX_LEN; + bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0, + inputs_embed.device_mem, (token_length - 1) * bytes, + bytes); + ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1, + &outputs_lm, 1, true, false); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem); + return token; +} + +int Baichuan2::forward_next() { + float attention_mask[MAX_LEN + 1] = {0}; + for (int i = token_length - 1; i < MAX_LEN; i++) { + attention_mask[i] = ATTENTION_MASK; + } + int32_t position_id = token_length - 1; + // embedding + outputs_lm.shape = net_embed_cache->stages[0].input_shapes[0]; + auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(), &outputs_lm, 1, + &inputs_lm, 1, true, false); + assert(ret); + bm_thread_sync(bm_handle); + + // blocks + bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask); + bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id); + auto inputs_embed = inputs_lm; + inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0]; + int bytes = bm_mem_get_device_size(present_key_cache.device_mem); + int token_offset = (token_length - 1) * bytes; + for (int i = 0; i < NUM_LAYERS; i++) { + bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention, + past_key[i], past_value[i]}; + bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache}; + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(), + 
inputs_block, 5, outputs_block, 3, true, false); + assert(ret); + bm_thread_sync(bm_handle); + bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset, + present_key_cache.device_mem, 0, + bytes); + bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset, + present_value_cache.device_mem, 0, + bytes); + } + outputs_lm.shape = net_lm->stages[0].output_shapes[0]; + ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1, + &outputs_lm, 1, true, false); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem); + return token; +} + +void Baichuan2::chat() { + while (true) { + std::cout << "\nQuestion: "; + std::string input_str; + std::getline(std::cin, input_str); + std::string user_token = ""; //user token id 195 + std::string assitant_token = ""; //assistant token id 196 + if (input_str == "exit") { + break; + } + if (input_str == "clear") { + history.clear(); + continue; + } + + input_str = user_token + input_str + assitant_token; + + std::cout << "\nAnswer: " << std::flush; + answer(input_str); + std::cout << std::endl; + } +} + +void Baichuan2::answer(const std::string &input_str) { + int tok_num = 0; + history.emplace_back(std::move(input_str)); + + std::vector tokens; + + std::string history_input = std::accumulate(history.begin(), history.end(), std::string()); + sentencepiece.Encode(history_input, &tokens); + + if (tokens.empty()) { + printf("Sorry: your question is too wierd!!\n"); + history.clear(); + round = 0; + return; + } + // make sure token not too large + if (tokens.size() > MAX_LEN - 10) { + // reset + if (round == 0) { + printf("Error: your question is too large!\n"); + return; + } + round = 0; + history.clear(); + answer(input_str); + return; + } + auto time_1 = std::chrono::system_clock::now(); + int pre_token = 0; + int token = forward_first(tokens); + auto time_2 = std::chrono::system_clock::now(); + std::string result; + while (token != EOS && 
token_length < MAX_LEN) { + std::string pre_word; + std::string word; + std::vector pre_ids = {pre_token}; + std::vector ids = {pre_token, token}; + sentencepiece.Decode(pre_ids, &pre_word); + sentencepiece.Decode(ids, &word); + std::string diff = word.substr(pre_word.size()); + result += diff; + std::cout << diff << std::flush; + if (token_length < MAX_LEN) { + token_length++; + } + tok_num++; + token = forward_next(); + } + auto time_3 = std::chrono::system_clock::now(); + auto ftl_dur = + std::chrono::duration_cast(time_2 - time_1); + auto tps_dur = + std::chrono::duration_cast(time_3 - time_2); + double tps = tok_num / (tps_dur.count() * 1e-6); + if (token_length >= MAX_LEN) { + printf(" ......\nWarning: cleanup early history\n"); + } + // double tht = tokens.size() / (tht_dur.count() * 1e-6); + printf("\nFTL:%f s, TPS: %f tokens/s\n", ftl_dur.count() * 1e-6, tps); + history.emplace_back(result); + if (token_length + 128 >= MAX_LEN) { + int num = (history.size() + 3) / 4 * 2; + history.erase(history.begin(), history.begin() + num); + } +} + +static void split(const std::string &s, const std::string &delim, + std::vector &ret) { + size_t last = 0; + size_t index = s.find_first_of(delim, last); + while (index != std::string::npos) { + ret.push_back(s.substr(last, index - last)); + last = index + 1; + index = s.find_first_of(delim, last); + } + if (last < s.length()) { + ret.push_back(s.substr(last)); + } +} + +static std::vector parseCascadeDevices(const std::string &str) { + std::vector devices; + std::vector sub_str; + split(str, ",", sub_str); + for (auto &s : sub_str) { + devices.push_back(std::atoi(s.c_str())); + } + return devices; +} + +void processArguments(int argc, char *argv[], std::string &baichuan_model, + std::vector &devices) { + struct option longOptions[] = {{"model", required_argument, nullptr, 'm'}, + {"dev_id", required_argument, nullptr, 'd'}, + {nullptr, 0, nullptr, 0}}; + + int optionIndex = 0; + int option; + + while ((option = 
getopt_long(argc, argv, "m:d:", longOptions, + &optionIndex)) != -1) { + switch (option) { + case 'm': + baichuan_model = optarg; + break; + case 'd': + devices = parseCascadeDevices(optarg); + break; + case '?': + exit(EXIT_FAILURE); + default: + exit(EXIT_FAILURE); + } + } +} + +int main(int argc, char **argv) { + // set your bmodel path here + printf("Demo for Baichuan2-7B in BM1684X\n"); + std::string baichuan_model = "baichuan2-7b-test.bmodel"; + std::vector devices = {0}; + processArguments(argc, argv, baichuan_model, devices); + + Baichuan2 baichuan; + printf("Init Environment ...\n"); + baichuan.init(devices, baichuan_model); + printf("==========================\n"); + baichuan.chat(); + baichuan.deinit(); + return 0; +} diff --git a/models/Baichuan2/requirements.txt b/models/Baichuan2/requirements.txt new file mode 100755 index 0000000..4708ef2 --- /dev/null +++ b/models/Baichuan2/requirements.txt @@ -0,0 +1,7 @@ +torch==2.1.2 +transformers==4.36.2 +sentencepiece==0.1.99 +gradio==3.39.0 +mdtex2html==1.2.0 +accelerate +onnx diff --git a/models/Baichuan2/src/include/bmdef.h b/models/Baichuan2/src/include/bmdef.h new file mode 100644 index 0000000..d41a4b0 --- /dev/null +++ b/models/Baichuan2/src/include/bmdef.h @@ -0,0 +1,129 @@ +/***************************************************************************** + * + * Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved. + * + * The material in this file is confidential and contains trade secrets + * of Sophgo Technologies Inc. This is proprietary information owned by + * Sophgo Technologies Inc. No part of this work may be disclosed, + * reproduced, copied, transmitted, or used in any way for any purpose, + * without the express written permission of Sophgo Technologies Inc. 
+ * + *****************************************************************************/ + +#ifndef __BMRUNTIME_DEFINE_H__ +#define __BMRUNTIME_DEFINE_H__ + +#include "bmlib_runtime.h" +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* --------------------------------------------------------------------------*/ +/* basic definitions */ + +/* bm_data_type_t holds the type for a scalar value */ +typedef enum bm_data_type_e { + BM_FLOAT32 = 0, + BM_FLOAT16 = 1, + BM_INT8 = 2, + BM_UINT8 = 3, + BM_INT16 = 4, + BM_UINT16 = 5, + BM_INT32 = 6, + BM_UINT32 = 7, + BM_BFLOAT16 = 8, + BM_INT4 = 9, + BM_UINT4 = 10, +} bm_data_type_t; + +/* store mode definitions */ +typedef enum bm_store_mode_e { + BM_STORE_1N = 0, /* default, if not sure, use 0 */ + BM_STORE_2N = 1, + BM_STORE_4N = 2, +} bm_store_mode_t; + +/* bm_shape_t holds the shape info */ +#define BM_MAX_DIMS_NUM 8 +typedef struct bm_shape_s { + int num_dims; + int dims[BM_MAX_DIMS_NUM]; +} bm_shape_t; + +typedef struct bm_shape_ex_s { + bm_shape_t shape; + int elem_num; +} bm_shape_ex_t; + +/* +bm_tensor_t holds a multi-dimensional array of elements of a single data type +and tensor are in device memory */ +typedef struct bm_tensor_s { + bm_data_type_t dtype; + bm_shape_t shape; + bm_device_mem_t device_mem; + bm_store_mode_t st_mode; /* user can set 0 as default store mode */ +} bm_tensor_t; + +/* --------------------------------------------------------------------------*/ +/* network information structure */ + +/* bm_stage_info_t holds input/output shapes and device mems; every network can contain one or more + * stages */ +typedef struct bm_stage_info_s { + bm_shape_t *input_shapes; /* input_shapes[0] / [1] / ... / [input_num-1] */ + bm_shape_t *output_shapes; /* output_shapes[0] / [1] / ... / [output_num-1] */ + bm_device_mem_t *input_mems; /* input_mems[0] / [1] / ... / [input_num-1] */ + bm_device_mem_t *output_mems; /* output_mems[0] / [1] / ... 
/ [output_num-1] */ +} bm_stage_info_t; + +/* bm_net_info_t holds all information of one net. + * scale for float type is 1.0 as default */ +typedef struct bm_net_info_s { + const char* name; /* net name */ + bool is_dynamic; /* dynamic or static */ + int input_num; /* number of inputs */ + char const** input_names; /* input_names[0] / [1] / .../ [input_num-1] */ + bm_data_type_t* input_dtypes; /* input_dtypes[0] / [1] / .../ [input_num-1] */ + float* input_scales; /* input_scales[0] / [1] / .../ [input_num-1] */ + int output_num; /* number of outputs */ + char const** output_names; /* output_names[0] / [1] / .../ [output_num-1] */ + bm_data_type_t* output_dtypes; /* output_dtypes[0] / [1] / .../ [output_num-1] */ + float* output_scales; /* output_scales[0] / [1] / .../ [output_num-1] */ + int stage_num; /* number of stages */ + bm_stage_info_t* stages; /* stages[0] / [1] / ... / [stage_num-1] */ + size_t* max_input_bytes; /* max_input_bytes[0] / [1] / ... / [input_num-1] */ + size_t* max_output_bytes; /* max_output_bytes[0] / [1] / ...
/ [output_num-1] */ + int* input_zero_point; /* input_zero_point[0] / [1] / .../ [input_num-1] */ + int* output_zero_point; /* output_zero_point[0] / [1] / .../ [output_num-1] */ + int *input_loc_devices; /* input_loc_devices[0] / [1] / .../ [input_num-1] */ + int *output_loc_devices; /* output_loc_devices[0] / [1] / .../ [output_num-1] */ +} bm_net_info_t; + +typedef struct api_info_s { + /// @brief api_id to be sent to driver + int32_t api_id; + /// @brief api data to be sent to driver + uint8_t **api_data; + /// @brief size of the api data to be sent to driver + size_t api_data_size; + /// @brief subsize of the api data to be sent to driver + size_t *api_data_subsize; + /// @brief offset of input tensors' addr in api_data + uint32_t *input_addr_offset; + /// @brief number of the offset of input tensors' addr in api_data + size_t input_addr_offset_number; + /// @brief offset of output tensors' addr in api_data + uint32_t *output_addr_offset; + /// @brief number of the offset of output tensors' addr in api_data + size_t output_addr_offset_number; +} api_info_c; + +#if defined(__cplusplus) +} +#endif + +#endif /* __BMRUNTIME_DEFINE_H__ */ diff --git a/models/Baichuan2/src/include/bmlib_runtime.h b/models/Baichuan2/src/include/bmlib_runtime.h new file mode 100644 index 0000000..071cfe0 --- /dev/null +++ b/models/Baichuan2/src/include/bmlib_runtime.h @@ -0,0 +1,2581 @@ +/***************************************************************************** + * + * Copyright (c) 2016-2026 by Bitmain Technologies Inc. All rights reserved. + * + * The material in this file is confidential and contains trade secrets + * of Bitmain Technologies Inc. This is proprietary information owned by + * Bitmain Technologies Inc. No part of this work may be disclosed, + * reproduced, copied, transmitted, or used in any way for any purpose, + * without the express written permission of Bitmain Technologies Inc.
+ * + *****************************************************************************/ + +/************************************************************************** + * bmlib_runtime defines interfaces that operate TPU devices. + * The functions can be divided into several categories. + * 1) device handle creation and destroy + * 2) memory help functions + * 3) global memory allocation and free + * 4) data transfer between host and device + * 5) data transfer within device memory + * 6) api send and synchronization + * 7) global memory map and coherence + * 8) trace and profile + * 9) power management + * 10) miscellaneous functions + *************************************************************************/ + +#ifndef BMLIB_RUNTIME_H_ +#define BMLIB_RUNTIME_H_ +#if defined(_WIN32) && !defined(__MINGW32__) + #include + #define DECL_EXPORT __declspec(dllexport) + #define DECL_IMPORT __declspec(dllimport) +#else + #include + #include + #include + #define DECL_EXPORT + #define DECL_IMPORT +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef enum { + MODULE_CDMA = 0, + MODULE_GDMA = 1, + MODULE_TPU = 2, + MODULE_SMMU = 3, + MODULE_SRAM = 4, + MODULE_END = 5 +} MODULE_ID; + +#define BM_MEM_ADDR_NULL (0xfffffffff) + +#ifndef BM_MEM_DESC_T_ +#define BM_MEM_DESC_T_ +/* BM function return code definitions */ +typedef enum { + BM_SUCCESS = 0, + BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */ + BM_ERR_FAILURE = 2, /* General failure */ + BM_ERR_TIMEOUT = 3, /* Timeout */ + BM_ERR_PARAM = 4, /* Parameters invalid */ + BM_ERR_NOMEM = 5, /* Not enough memory */ + BM_ERR_DATA = 6, /* Data error */ + BM_ERR_BUSY = 7, /* Busy */ + BM_ERR_NOFEATURE = 8, /* Not supported yet */ + BM_NOT_SUPPORTED = 9 +} bm_status_t; + +/* BM memory type definitions */ +typedef enum { + BM_MEM_TYPE_DEVICE = 0, + BM_MEM_TYPE_HOST = 1, + BM_MEM_TYPE_SYSTEM = 2, + BM_MEM_TYPE_INT8_DEVICE = 3, + BM_MEM_TYPE_INVALID = 4 +} bm_mem_type_t; + +typedef enum { + PERF_MONITOR_GDMA = 0, +
PERF_MONITOR_TPU = 1 +} PERF_MONITOR_ID; + +typedef enum { + BMCPU_IDLE = 0, + BMCPU_RUNNING = 1, + BMCPU_FAULT = 2 +} bm_cpu_status_t; + +/* +* bm performance monitor +*/ +typedef struct bm_perf_monitor { + long long buffer_start_addr; /*buffer address to store perf data*/ + int buffer_size; /*buffer size*/ + PERF_MONITOR_ID monitor_id; /*PERF_MONITOR_GDMA or PERF_MONITOR_TPU*/ +} bm_perf_monitor_t; + +typedef union { + struct { + bm_mem_type_t mem_type : 3; + unsigned int gmem_heapid : 3; + unsigned int reserved : 26; + } u; + unsigned int rawflags; +} bm_mem_flags_t; + +/* BM memory descriptor definition*/ +typedef struct bm_mem_desc { + union { + struct { +#ifdef __linux__ + unsigned long device_addr; +#else + unsigned long long device_addr; +#endif + unsigned int reserved; + int dmabuf_fd; + } device; + + struct { + void *system_addr; + unsigned int reserved0; + int reserved1; + } system; + } u; + + bm_mem_flags_t flags; + unsigned int size; +} bm_mem_desc_t; + +typedef struct bm_mem_desc bm_device_mem_t; +typedef struct bm_mem_desc bm_system_mem_t; + +typedef struct sg_mem_desc { + union { + struct { +#ifdef __linux__ + unsigned long device_addr; +#else + unsigned long long device_addr; +#endif + unsigned int reserved; + int dmabuf_fd; + } device; + + struct { + void *system_addr; + unsigned int reserved0; + int reserved1; + } system; + } u; + + bm_mem_flags_t flags; + unsigned long long size; +} sg_mem_desc_t; + +typedef struct sg_mem_desc sg_device_mem_t; +typedef struct sg_mem_desc sg_system_mem_t; +#endif + +struct bm_context; +typedef struct bm_context *bm_handle_t; + +#define MD5SUM_LEN 16 +#define LIB_MAX_NAME_LEN 64 +#define FUNC_MAX_NAME_LEN 64 + +typedef struct bm_module +{ + // void *lib_handle; + char lib_name[LIB_MAX_NAME_LEN]; + unsigned char md5[MD5SUM_LEN]; +}bm_module; + +typedef struct bm_module *tpu_kernel_module_t; +typedef int tpu_kernel_function_t; + +/** + * @name tpu_kernel_load_module_file + * @brief To load dyn file + * @ingroup
bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] module_file dyn file + * @retval dyn lib ptr + */ +tpu_kernel_module_t tpu_kernel_load_module_file(bm_handle_t handle, const char *module_file); + +/** + * @name tpu_kernel_load_module_file_key + * @brief To load dyn file with key + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] module_file dyn file + * @param [in] key identification str + * @param [in] size key size + * @retval dyn lib ptr + */ +tpu_kernel_module_t tpu_kernel_load_module_file_key(bm_handle_t handle, const char *module_file, const char *key, int size); + +/** + * @name tpu_kernel_unload_module + * @brief To unload dyn file + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] p_module dyn lib ptr + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +bm_status_t tpu_kernel_unload_module(bm_handle_t handle, tpu_kernel_module_t p_module); + +/** + * @name tpu_kernel_free_module + * @brief To free p_module when not use + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] p_module dyn lib ptr + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +bm_status_t tpu_kernel_free_module(bm_handle_t handle, tpu_kernel_module_t p_module); + +/** + * @name tpu_kernel_load_module + * @brief To load dyn module + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] data dyn module + * @param [in] length dyn module size + * @retval dyn lib ptr + */ +tpu_kernel_module_t tpu_kernel_load_module(bm_handle_t handle, const char *data, size_t length); + +/** + * @name tpu_kernel_get_function + * @brief To get function from lib + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] module dyn module + * @param [in] function function name + * @retval function id + */ +tpu_kernel_function_t tpu_kernel_get_function(bm_handle_t handle, tpu_kernel_module_t module, const char *function); + +/** + * @name tpu_kernel_launch + * @brief To launch function with sync + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] function function id + * @param [in] args function args + * @param [in] size args size + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +bm_status_t tpu_kernel_launch(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size); + +/** + * @name tpu_kernel_launch_async + * @brief To launch function with async + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] function function id + * @param [in] args function args + * @param [in] size args size + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +bm_status_t tpu_kernel_launch_async(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size); + +/** + * @name tpu_kernel_launch_async_multi_cores + * @brief To launch function with async for multi cores + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] func_name function name + * @param [in] api_param function params + * @param [in] api_size params size + * @param [in] core_list list of core ids + * @param [in] core_num number of cores + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +bm_status_t tpu_kernel_launch_async_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param, + size_t api_size, const int* core_list, const int core_num); + +/** + * @name tpu_kernel_launch_sync_multi_cores + * @brief To launch function with sync for multi cores + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] func_name function name + * @param [in] api_param function params + * @param [in] api_size params size + * @param [in] core_list list of core ids + * @param [in] core_num number of cores + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +bm_status_t tpu_kernel_launch_sync_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param, + size_t api_size, const int* core_list, const int core_num); + +/** + * @name tpu_kernel_sync + * @brief To sync + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +bm_status_t tpu_kernel_sync(bm_handle_t handle); +void show_md5(unsigned char md5[]); + +DECL_EXPORT void bmlib_log(const char *tag, int level, const char *fmt, ...); + +#ifndef USING_CMODEL +#define BM_CHECK_RET(call) \ + do { \ + bm_status_t ret = (bm_status_t)call; \ + if (ret != BM_SUCCESS) { \ + bmlib_log("BM_CHECK",16,"BM_CHECK_RET fail %s: %s: %d\n", __FILE__, __func__, __LINE__); \ + return ret; \ + } \ + } while (0) +#else +#define BM_CHECK_RET(call) \ + do { \ + bm_status_t ret = call; \ + if (ret != BM_SUCCESS) { \ + bmlib_log("BM_CHECK",16,"BM_CHECK_RET failed %d\n", ret);\ + ASSERT(0); \ + exit(-ret); \ + } \ + } while (0) +#endif + +/*******************handle related functions *********************************/ +/** + * @name bm_dev_getcount + * @brief To get the number of sophon devices in system. + * If N is got, valid devid is [0, N-1] + * @ingroup bmlib_runtime + * + * @param [out] count The result number of sophon devices + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_dev_getcount(int *count); + +/** + * @name bm_dev_query + * @brief To query if a device is present + * @ingroup bmlib_runtime + * + * @param [in] devid The id of the device to query + * @retval BM_SUCCESS Device is present + * Other code Device is not present + */ +DECL_EXPORT bm_status_t bm_dev_query(int devid); + +/** + * @name bm_dev_request + * @brief To create a handle for the given device + * @ingroup bmlib_runtime + * + * @param [out] handle The created handle + * @param [in] devid Specify on which device to create handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_dev_request(bm_handle_t *handle, int devid); + +/** + * @name bm_get_devid + * @brief To get device index for the given handle + * @ingroup bmlib_runtime + * + * @param [in] handle The given handle + * @retval int device index that the handle points to.
+ */ +DECL_EXPORT int bm_get_devid(bm_handle_t handle); + +/** + * @name bm_dev_free + * @brief To free a handle + * @ingroup bmlib_runtime + * + * @param [in] handle The handle to free + */ +DECL_EXPORT void bm_dev_free(bm_handle_t handle); + +/*******************memory help functions ************************************/ +/** + * @name bm_mem_get_type + * @brief To get a memory descriptor's type + * @ingroup bmlib_runtime + * + * @param [in] mem The memory descriptor queried + * @retval BM_MEM_TYPE_DEVICE Device global memory + * @retval BM_MEM_TYPE_SYSTEM Host user memory + */ +DECL_EXPORT bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem); + +/** + * @name sg_mem_get_type + * @brief To get a memory descriptor's type + * @ingroup bmlib_runtime + * + * @param [in] mem The memory descriptor queried + * @retval BM_MEM_TYPE_DEVICE Device global memory + * @retval BM_MEM_TYPE_SYSTEM Host user memory + */ +DECL_EXPORT bm_mem_type_t sg_mem_get_type(struct sg_mem_desc mem); + +/** + * @name bm_mem_get_device_addr + * @brief To get a device memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] mem The device memory descriptor queried + * @retval unsigned long long The device memory address + */ +DECL_EXPORT unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem); + +/** + * @name sg_mem_get_device_addr + * @brief To get a device memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] mem The device memory descriptor queried + * @retval unsigned long long The device memory address + */ +DECL_EXPORT unsigned long long sg_mem_get_device_addr(struct sg_mem_desc mem); + +/** + * @name bm_mem_set_device_addr + * @brief To set a device memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] pmem The device memory descriptor pointer + * @param [in] addr The new device address of the device memory + */ +DECL_EXPORT void bm_mem_set_device_addr(struct bm_mem_desc* pmem, unsigned long long addr); + +/** + *
@name sg_mem_set_device_addr + * @brief To set a device memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] pmem The device memory descriptor pointer + * @param [in] addr The new device address of the device memory + */ +DECL_EXPORT void sg_mem_set_device_addr(struct sg_mem_desc* pmem, unsigned long long addr); + +/** + * @name bm_mem_get_device_size + * @brief To get a device memory descriptor's size + * @ingroup bmlib_runtime + * + * @param [in] mem The device memory descriptor queried + * @retval unsigned int The device memory's size in bytes + */ +DECL_EXPORT unsigned int bm_mem_get_device_size(struct bm_mem_desc mem); + +/** + * @name sg_mem_get_device_size + * @brief To get a device memory descriptor's size + * @ingroup bmlib_runtime + * + * @param [in] mem The device memory descriptor queried + * @retval unsigned long long The device memory's size in bytes + */ +DECL_EXPORT unsigned long long sg_mem_get_device_size(struct sg_mem_desc mem); + +/** + * @name bm_mem_set_device_size + * @brief To set a device memory descriptor's size + * @ingroup bmlib_runtime + * + * @param [out] pmem The device memory descriptor pointer + * @param [in] size The new device memory size (in bytes) of the device memory + */ +DECL_EXPORT void bm_mem_set_device_size(struct bm_mem_desc* pmem, unsigned int size); + +/** + * @name sg_mem_set_device_size + * @brief To set a device memory descriptor's size + * @ingroup bmlib_runtime + * + * @param [out] pmem The device memory descriptor pointer + * @param [in] size The new device memory size (in bytes) of the device memory + */ +DECL_EXPORT void sg_mem_set_device_size(struct sg_mem_desc* pmem, unsigned long long size); + +/** + * @name bm_set_device_mem + * @brief To fill in a device memory descriptor with size and address + * @ingroup bmlib_runtime + * + * @param [in] pmem The device memory descriptor pointer + * @param [in] size The device memory descriptor's size + * @param [in] addr The device memory descriptor's
address + */ +DECL_EXPORT void bm_set_device_mem(bm_device_mem_t* pmem, unsigned int size, + unsigned long long addr); + +/** + * @name sg_set_device_mem + * @brief To fill in a device memory descriptor with size and address + * @ingroup bmlib_runtime + * + * @param [in] pmem The device memory descriptor pointer + * @param [in] size The device memory descriptor's size + * @param [in] addr The device memory descriptor's address + */ +DECL_EXPORT void sg_set_device_mem(sg_device_mem_t* pmem, unsigned long long size, + unsigned long long addr); + +/** + * @name bm_mem_from_device + * @brief To create a device memory descriptor from address and size + * @ingroup bmlib_runtime + * + * @param [in] device_addr The device memory address + * @param [in] len The device memory size + * @retval bm_device_mem_t The device memory descriptor created + */ +DECL_EXPORT bm_device_mem_t bm_mem_from_device(unsigned long long device_addr, + unsigned int len); + +/** + * @name sg_mem_from_device + * @brief To create a device memory descriptor from address and size + * @ingroup bmlib_runtime + * + * @param [in] device_addr The device memory address + * @param [in] len The device memory size + * @retval sg_device_mem_t The device memory descriptor created + */ +DECL_EXPORT sg_device_mem_t sg_mem_from_device(unsigned long long device_addr, + unsigned long long len); + +/** + * @name bm_mem_get_system_addr + * @brief To get a system memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] mem The system memory descriptor + * @retval void * The system memory descriptor's address + */ +DECL_EXPORT void *bm_mem_get_system_addr(struct bm_mem_desc mem); + +/** + * @name sg_mem_get_system_addr + * @brief To get a system memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] mem The system memory descriptor + * @retval void * The system memory descriptor's address + */ +DECL_EXPORT void *sg_mem_get_system_addr(struct sg_mem_desc mem); + +/** + * @name
bm_mem_set_system_addr + * @brief To set a system memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] pmem The system memory descriptor pointer + * @param [in] addr The system memory address + */ +DECL_EXPORT void bm_mem_set_system_addr(struct bm_mem_desc* pmem, void *addr); + +/** + * @name sg_mem_set_system_addr + * @brief To set a system memory descriptor's address + * @ingroup bmlib_runtime + * + * @param [in] pmem The system memory descriptor pointer + * @param [in] addr The system memory address + */ +DECL_EXPORT void sg_mem_set_system_addr(struct sg_mem_desc* pmem, void *addr); + +/** + * @name bm_mem_from_system + * @brief To create a system memory descriptor with the given system address + * @ingroup bmlib_runtime + * + * @param [in] system_addr The system address in the descriptor + * @retval bm_system_mem_t The system memory descriptor created + */ +DECL_EXPORT bm_system_mem_t bm_mem_from_system(void *system_addr); + +/*******************memory alloc and free functions ***************************/ +/** + * @name bm_mem_null + * @brief Return an illegal device memory descriptor + * @ingroup bmlib_runtime + * + * @retval bm_device_mem_t An invalid device memory descriptor + */ +DECL_EXPORT bm_device_mem_t bm_mem_null(void); +#define BM_MEM_NULL (bm_mem_null()) + +/** + * @name bm_malloc_neuron_device + * @brief To malloc device memory according to a tensor shape + * (each neuron is 32 bits) + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] n, c, h, w The shape of the input tensor + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +DECL_EXPORT bm_status_t bm_malloc_neuron_device(bm_handle_t handle, bm_device_mem_t *pmem, + int n, int c, int h, int w); + +/** + * @name sg_malloc_neuron_device + * @brief To malloc device memory according to a tensor shape + * (each neuron is 32 bits) + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] n, c, h, w The shape of the input tensor + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_malloc_neuron_device(bm_handle_t handle, sg_device_mem_t *pmem, + unsigned long long n, unsigned long long c, + unsigned long long h, unsigned long long w); + +/** + * @name bm_malloc_device_dword + * @brief To malloc device memory in size of dword (32 bits) + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] count The number of dwords(32bits) to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_malloc_device_dword(bm_handle_t handle, bm_device_mem_t *pmem, + int count); + +/** + * @name sg_malloc_device_dword + * @brief To malloc device memory in size of dword (32 bits) + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] count The number of dwords(32bits) to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_malloc_device_dword(bm_handle_t handle, sg_device_mem_t *pmem, + unsigned long long count); + +/** + * @name bm_malloc_device_byte + * @brief To malloc device memory in size of byte + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +DECL_EXPORT bm_status_t bm_malloc_device_byte(bm_handle_t handle, bm_device_mem_t *pmem, + unsigned int size); + +/** + * @name sg_malloc_device_byte + * @brief To malloc device memory in size of byte + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_malloc_device_byte(bm_handle_t handle, sg_device_mem_t *pmem, + unsigned long long size); + +/** + * @name bm_malloc_device_byte_heap + * @brief To malloc device memory in size of byte within the specified heap + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] heap_id The heap where to allocate 0/1/2 + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_malloc_device_byte_heap(bm_handle_t handle, bm_device_mem_t *pmem, + int heap_id, unsigned int size); + +/** + * @name sg_malloc_device_byte_heap + * @brief To malloc device memory in size of byte within the specified heap + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] heap_id The heap where to allocate 0/1/2 + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_malloc_device_byte_heap(bm_handle_t handle, sg_device_mem_t *pmem, + int heap_id, unsigned long long size); + +/** + * @name bm_malloc_device_byte_heap_mask + * @brief To malloc device memory in size of byte within the specified heaps + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] heap_id_mask The mask which heaps allocate from. 
each bit indicate one heap + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_malloc_device_byte_heap_mask(bm_handle_t handle, bm_device_mem_t *pmem, + int heap_id_mask, unsigned int size); + +/** + * @name sg_malloc_device_byte_heap_mask + * @brief To malloc device memory in size of byte within the specified heaps + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmem The result device memory descriptor + * @param [in] heap_id_mask The mask which heaps allocate from. each bit indicate one heap + * @param [in] size The number of bytes to allocate + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_malloc_device_byte_heap_mask(bm_handle_t handle, sg_device_mem_t *pmem, + int heap_id_mask, unsigned long long size); + +/** + * @name bm_free_device + * @brief To free device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] mem The device memory descriptor to free + */ +DECL_EXPORT void bm_free_device(bm_handle_t handle, bm_device_mem_t mem); + +/** + * @name sg_free_device + * @brief To free device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] mem The device memory descriptor to free + */ +DECL_EXPORT void sg_free_device(bm_handle_t handle, sg_device_mem_t mem); + +/** + * @name bm_gmem_arm_reserved_request + * @brief To obtain the address of global memory reserved for arm926 + * @param [in] handle The device handle + * + * @retval unsigned long long The absolute address of gmem reserved for arm926 + */ +DECL_EXPORT unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle); + +/** + * @name bm_gmem_arm_reserved_release + * @brief To release the global memory reserved for arm926 + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + */ +DECL_EXPORT void 
bm_gmem_arm_reserved_release(bm_handle_t handle); + +/*******************memory copy functions *************************************/ +/** + * @name bm_memcpy_s2d + * @brief To copy data from system memory to device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor ) + * @param [in] src The source memory (system memory, a void* pointer) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_s2d(bm_handle_t handle, bm_device_mem_t dst, void *src); + +/** + * @name bm_memcpy_p2p + * @brief To copy data from one chip to another chip + * @ingroup bmlib_runtime + * + * @param [in] handle_src The source device handle + * @param [in] src The source memory (device memory descriptor ) + * @param [in] handle_dst The destination device handle + * @param [in] dst The destination memory (device memory descriptor ) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_p2p(bm_handle_t handle_src, bm_device_mem_t src, bm_handle_t handle_dst,bm_device_mem_t dst); + +/** + * @name sg_memcpy_s2d + * @brief To copy data from system memory to device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor ) + * @param [in] src The source memory (system memory, a void* pointer) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_s2d(bm_handle_t handle, sg_device_mem_t dst, void *src); + +/** + * @name bm_memcpy_s2d_partial_offset + * @brief To copy specified bytes of data from system memory to device memory + * with an offset in device memory address. 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor) + * @param [in] src The source memory (system memory, a void* pointer) + * @param [in] size The size of data to copy (in bytes) + * @param [in] offset The offset of the device memory address + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_s2d_partial_offset(bm_handle_t handle, + bm_device_mem_t dst, void *src, + unsigned int size, + unsigned int offset); + +/** + * @name sg_memcpy_s2d_partial_offset + * @brief To copy specified bytes of data from system memory to device memory + * with an offset in device memory address. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor) + * @param [in] src The source memory (system memory, a void* pointer) + * @param [in] size The size of data to copy (in bytes) + * @param [in] offset The offset of the device memory address + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_s2d_partial_offset(bm_handle_t handle, + sg_device_mem_t dst, void *src, + unsigned long long size, + unsigned long long offset); + +/** + * @name bm_memcpy_s2d_partial + * @brief To copy specified bytes of data from system memory to device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor) + * @param [in] src The source memory (system memory, a void* pointer) + * @param [in] size The size of data to copy (in bytes) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_memcpy_s2d_partial(bm_handle_t handle, bm_device_mem_t dst, + void *src, unsigned int size); + +/** + * @name sg_memcpy_s2d_partial + * @brief To copy specified bytes of data from system memory to device memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (device memory descriptor) + * @param [in] src The source memory (system memory, a void* pointer) + * @param [in] size The size of data to copy (in bytes) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_s2d_partial(bm_handle_t handle, sg_device_mem_t dst, + void *src, unsigned long long size); + +/** + * @name bm_memcpy_d2s + * @brief To copy data from device memory to system memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2s(bm_handle_t handle, void *dst, bm_device_mem_t src); + +/** + * @name sg_memcpy_d2s + * @brief To copy data from device memory to system memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_d2s(bm_handle_t handle, void *dst, sg_device_mem_t src); + +/** + * @name bm_memcpy_d2s_partial_offset + * @brief To copy specified bytes of data from device memory to system memory + * with an offset in device memory address. 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * @param [in] size The size of data to copy (in bytes) + * @param [in] offset The offset of the device memory address + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst, + bm_device_mem_t src, unsigned int size, + unsigned int offset); + +/** + * @name sg_memcpy_d2s_partial_offset + * @brief To copy specified bytes of data from device memory to system memory + * with an offset in device memory address. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * @param [in] size The size of data to copy (in bytes) + * @param [in] offset The offset of the device memory address + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst, + sg_device_mem_t src, unsigned long long size, + unsigned long long offset); + +/** + * @name bm_memcpy_d2s_partial + * @brief To copy specified bytes of data from device memory to system memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * @param [in] size The size of data to copy (in bytes) + * + * @retval BM_SUCCESS Data transfer succeeds. + * Other code Data transfer fails. 
+ */ +DECL_EXPORT bm_status_t bm_memcpy_d2s_partial(bm_handle_t handle, void *dst, + bm_device_mem_t src, unsigned int size); + +/** + * @name sg_memcpy_d2s_partial + * @brief To copy specified bytes of data from device memory to system memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination memory (system memory, a void* pointer) + * @param [in] src The source memory (device memory descriptor) + * @param [in] size The size of data to copy (in bytes) + * + * @retval BM_SUCCESS Data transfer succeeds. + * Other code Data transfer fails. + */ +DECL_EXPORT bm_status_t sg_memcpy_d2s_partial(bm_handle_t handle, void *dst, + sg_device_mem_t src, unsigned long long size); + +/** + * @name bm_memcpy_d2d + * @brief To copy specified dwords of data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_offset The offset of destination device memory address + * @param [in] src The source device memory + * @param [in] src_offset The offset of source device memory address + * @param [in] len Length of data to copy (in DWORD 4 bytes) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d(bm_handle_t handle, bm_device_mem_t dst, + int dst_offset, bm_device_mem_t src, int src_offset, + int len); + +/** + * @name bm_memcpy_d2d_with_core + * @brief To copy specified dwords of data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_offset The offset of destination device memory address + * @param [in] src The source device memory + * @param [in] src_offset The offset of source device memory address + * @param [in] len Length of data to copy (in DWORD 4 bytes) + * @param [in] core_id The core id to copy + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d_with_core(bm_handle_t handle, bm_device_mem_t dst, + int dst_offset, bm_device_mem_t src, int src_offset, + int len, int core_id); + +/** + * @name bm_memcpy_d2d_byte + * @brief To copy specified bytes of data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_offset The offset of destination device memory address (in bytes) + * @param [in] src The source device memory + * @param [in] src_offset The offset of source device memory address (in bytes) + * @param [in] size Size of data to copy (in bytes) + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d_byte(bm_handle_t handle, bm_device_mem_t dst, + size_t dst_offset, bm_device_mem_t src, + size_t src_offset, size_t size); + +/** + * @name bm_memcpy_d2d_byte_with_core + * @brief To copy specified bytes of data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_offset The offset of destination device memory address (in bytes) + * @param [in] src The source device memory + * @param [in] src_offset The offset of source device memory address (in bytes) + * @param [in] size Size of data to copy (in bytes) + * @param [in] core_id The core id to copy + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d_byte_with_core(bm_handle_t handle, bm_device_mem_t dst, + size_t dst_offset, bm_device_mem_t src, + size_t src_offset, size_t size, int core_id); + +/** + * @name bm_memcpy_d2d_stride + * @brief To copy specified data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_stride The data stride of destination data + * @param [in] src The source device memory + * @param [in] src_stride The data stride of source data + * @param [in] count Count of data to copy + * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc. + * format_size only support 1/2/4. + * + * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size ==1 + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d_stride(bm_handle_t handle, + bm_device_mem_t dst, + int dst_stride, + bm_device_mem_t src, + int src_stride, + int count, + int format_size); + +/** + * @name bm_memcpy_d2d_stride_with_core + * @brief To copy specified data from one piece of device memory + * to another piece of device memory within one device. Both source + * and destination offsets can be specified. 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dst The destination device memory + * @param [in] dst_stride The data stride of destination data + * @param [in] src The source device memory + * @param [in] src_stride The data stride of source data + * @param [in] count Count of data to copy + * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc. + * format_size only support 1/2/4. + * @param [in] core_id The core id to copy. + * + * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size ==1 + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_d2d_stride_with_core(bm_handle_t handle, + bm_device_mem_t dst, + int dst_stride, + bm_device_mem_t src, + int src_stride, + int count, + int format_size, + int core_id); + +/** + * @name bm_memcpy_c2c + * @brief To copy data from one chip to another chip. + * (Used in multi-chip card scenario) + * @ingroup bmlib_runtime + * + * @param [in] src_handle The source device handle + * @param [in] dst_handle The destination device handle + * @param [in] src The source device memory descriptor + * @param [in] dst The destination device memory descriptor + * @param [in] force_dst_cdma If use the CDMA engine of the destination device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memcpy_c2c(bm_handle_t src_handle, bm_handle_t dst_handle, + bm_device_mem_t src, bm_device_mem_t dst, + bool force_dst_cdma); + +/** + * @name bm_memset_device + * @brief To fill in specified device memory with the given value + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] value The value used to fill. (int type) + * @param [in] mem The device memory which will be filled in + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_memset_device(bm_handle_t handle, const int value, + bm_device_mem_t mem); + +/** + * @name bm_memset_device_ext + * @brief To fill in specified device memory with the given value and mode + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] value The pointer of value used to fill + * @param [in] mode The valid bytes of *value + * @param [in] mem The device memory which will be filled in + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_memset_device_ext(bm_handle_t handle, void* value, int mode, + bm_device_mem_t mem); + +/** + * @name bm_mem_convert_system_to_device_neuron + * @brief To malloc a piece of device memory according to the shape of + * neuron(in DWORD 4 bytes); copy neuron from system memory to + * device memory if need_copy is true. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory descriptor + * @param [in] sys_mem The system memory descriptor + * @param [in] need_copy If copy from system to device is needed + * @param [in] n,c,h,w Neuron shape size + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron(bm_handle_t handle, + struct bm_mem_desc *dev_mem, + struct bm_mem_desc sys_mem, + bool need_copy, int n, int c, + int h, int w); + +/** + * @name bm_mem_convert_system_to_device_neuron_byte + * @brief To malloc a piece of device memory according to the shape of + * neuron(in bytes); copy neuron from system memory to + * device memory if need_copy is true. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory descriptor + * @param [in] sys_mem The system memory descriptor + * @param [in] need_copy If copy from system to device is needed + * @param [in] n,c,h,w Neuron shape size + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron_byte( + bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem, + bool need_copy, int n, int c, int h, int w); + +/** + * @name bm_mem_convert_system_to_device_coeff + * @brief To malloc a piece of device memory according to the size of + * coefficient (in DWORD 4 bytes); copy coefficient from system + * memory to device memory if need_copy is true. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory descriptor + * @param [in] sys_mem The system memory descriptor + * @param [in] need_copy If copy from system to device is needed + * @param [in] coeff_count Coefficient size + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff(bm_handle_t handle, + struct bm_mem_desc *dev_mem, + struct bm_mem_desc sys_mem, + bool need_copy, + int coeff_count); +/** + * @name bm_mem_convert_system_to_device_coeff_byte + * @brief To malloc a piece of device memory according to the size of + * coefficient (in bytes); copy coefficient from system + * memory to device memory if need_copy is true. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory descriptor + * @param [in] sys_mem The system memory descriptor + * @param [in] need_copy If copy from system to device is needed + * @param [in] coeff_count Coefficient size + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff_byte( + bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem, + bool need_copy, int coeff_count); + +/*******************memory map functions *************************************/ +/** + * @name bm_mem_mmap_device_mem + * @brief To map a piece of device memory to user space with cache enabled. 
+ * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory to map + * @param [out] vmem The virtual address of the mapped device memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_mmap_device_mem(bm_handle_t handle, bm_device_mem_t *dmem, + + unsigned long long *vmem); + +/** + * @name sg_mem_mmap_device_mem + * @brief To map a piece of device memory to user space with cache enabled. + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory to map + * @param [out] vmem The virtual address of the mapped device memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_mmap_device_mem(bm_handle_t handle, sg_device_mem_t *dmem, + unsigned long long *vmem); + +/*******************memory map functions *************************************/ +/** + * @name bm_mem_mmap_device_mem_no_cache + * @brief To map a piece of device memory to user space with cache disabled. + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory to map + * @param [out] vmem The virtual address of the mapped device memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_mmap_device_mem_no_cache(bm_handle_t handle, bm_device_mem_t *dmem, + + unsigned long long *vmem); + +/** + * @name sg_mem_mmap_device_mem_no_cache + * @brief To map a piece of device memory to user space with cache disabled. + * (only valid in SoC mode; Not supported in PCIE mode). 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dev_mem The device memory to map + * @param [out] vmem The virtual address of the mapped device memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_mmap_device_mem_no_cache(bm_handle_t handle, sg_device_mem_t *dmem, + unsigned long long *vmem); + +/** + * @name bm_mem_vir_to_phy + * @brief To get device mem address through the mapped virtual address . + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] vmem The virtual address of the mapped device memory + * @param [out] dev_mem The device memory address + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_vir_to_phy(bm_handle_t handle, unsigned long long vmem, + unsigned long long *device_mem); +/** + * @name bm_mem_invalidate_device_mem + * @brief To invalidate a piece of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to invalidate + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ + +DECL_EXPORT bm_status_t bm_mem_invalidate_device_mem(bm_handle_t handle, + bm_device_mem_t *dmem); + +/** + * @name sg_mem_invalidate_device_mem + * @brief To invalidate a piece of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to invalidate + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ + +DECL_EXPORT bm_status_t sg_mem_invalidate_device_mem(bm_handle_t handle, + sg_device_mem_t *dmem); + +/** + * @name bm_mem_invalidate_partial_device_mem + * @brief To invalidate part of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to invalidate + * @param [in] offset The offset of device memory address + * @param [in] len The length of memory to invalidate in bytes + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_invalidate_partial_device_mem(bm_handle_t handle, + bm_device_mem_t *dmem, + unsigned int offset, + unsigned int len); + +/** + * @name sg_mem_invalidate_partial_device_mem + * @brief To invalidate part of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to invalidate + * @param [in] offset The offset of device memory address + * @param [in] len The length of memory to invalidate in bytes + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_invalidate_partial_device_mem(bm_handle_t handle, + sg_device_mem_t *dmem, + unsigned long long offset, + unsigned long long len); + +/** + * @name bm_mem_flush_device_mem + * @brief To flush a piece of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to flush + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_mem_flush_device_mem(bm_handle_t handle, bm_device_mem_t *dmem); + +/** + * @name sg_mem_flush_device_mem + * @brief To flush a piece of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to flush + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_flush_device_mem(bm_handle_t handle, sg_device_mem_t *dmem); + +/** + * @name bm_mem_flush_partial_device_mem + * @brief To flush part of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to flush + * @param [in] offset The offset of device memory address + * @param [in] len The length of memory to flush in bytes + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_flush_partial_device_mem(bm_handle_t handle, + bm_device_mem_t *dmem, + unsigned int offset, + unsigned int len); + +/** + * @name sg_mem_flush_partial_device_mem + * @brief To flush part of mapped device memory to maintain + * cache coherence + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] dmem The device memory to flush + * @param [in] offset The offset of device memory address + * @param [in] len The length of memory to flush in bytes + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_flush_partial_device_mem(bm_handle_t handle, + sg_device_mem_t *dmem, + unsigned long long offset, + unsigned long long len); + +/** + * @name bm_mem_unmap_device_mem + * @brief To unmap a piece of mapped device memory + * (only valid in SoC mode; Not supported in PCIE mode). 
+ * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] vmem The virtual address of the mapped device memory + * @param [in] size The size of unmapped memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_mem_unmap_device_mem(bm_handle_t handle, void *vmem, int size); + +/** + * @name sg_mem_unmap_device_mem + * @brief To unmap a piece of mapped device memory + * (only valid in SoC mode; Not supported in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] vmem The virtual address of the mapped device memory + * @param [in] size The size of unmapped memory + * + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t sg_mem_unmap_device_mem(bm_handle_t handle, void *vmem, unsigned long long size); + +/*******************api(kernel) functions *************************************/ +/** + * @name bm_flush + * @brief To synchronize APIs of the current thread. The thread will block + * until all the outstanding APIs of the current thread are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + */ +DECL_EXPORT void bm_flush(bm_handle_t handle); + +/** + * @name bm_device_sync + * @brief To synchronize APIs of the device. The thread will block + * until all the outstanding APIs of the device are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_device_sync(bm_handle_t handle); + +/** + * @name bm_handle_sync + * @brief To synchronize APIs of the handle. The thread will block + * until all the outstanding APIs of the handle are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_handle_sync(bm_handle_t handle); + +/** + * @name bm_handle_sync_from_core + * @brief To synchronize APIs of the handle. The thread will block + * until all the outstanding APIs of the handle are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] core_id The core id + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_handle_sync_from_core(bm_handle_t handle, int core_id); + +/** + * @name bm_thread_sync + * @brief To synchronize APIs of the current thread. The thread will block + * until all the outstanding APIs of the current thread are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_thread_sync(bm_handle_t handle); + +/** + * @name bm_thread_sync_from_core + * @brief To synchronize APIs of the current thread. The thread will block + * until all the outstanding APIs of the current thread are finished. + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] core_id The core id + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_thread_sync_from_core(bm_handle_t handle, int core_id); + +/*******************trace and profile related functions **********************/ +typedef struct bm_profile { +#ifdef __linux__ + unsigned long cdma_in_time; + unsigned long cdma_in_counter; + unsigned long cdma_out_time; + unsigned long cdma_out_counter; + unsigned long tpu_process_time; + unsigned long tpu1_process_time; + unsigned long sent_api_counter; + unsigned long completed_api_counter; +#else + unsigned long long cdma_in_time; + unsigned long long cdma_in_counter; + unsigned long long cdma_out_time; + unsigned long long cdma_out_counter; + unsigned long long tpu_process_time; + unsigned long long tpu1_process_time; + unsigned long long sent_api_counter; + unsigned long long completed_api_counter; +#endif +} bm_profile_t; +/** + * @name bm_get_profile + * @brief To get the profile data at the moment + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] profile The result profile data + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_profile(bm_handle_t handle, bm_profile_t *profile); + +typedef struct bootloader_version{ + char *bl1_version; + char *bl2_version; + char *bl31_version; + char *uboot_version; +} boot_loader_version; + +/** + * @name bm_get_boot_loader_version + * @brief To get the boot_loader_version + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] version The result version data + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_get_boot_loader_version(bm_handle_t handle, boot_loader_version *version); + +/** + * @name bm_get_vpu_instant_usage + * @brief To get vpu usage + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] vpu_usage The result vpu usage + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_get_vpu_instant_usage(bm_handle_t handle, int *vpu_usage); + +/** + * @name bm_get_jpu_core_usage + * @brief To get the jpu usage + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] jpu_usage The result jpu usage + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_jpu_core_usage(bm_handle_t handle, int *jpu_usage); + +/** + * @name bm_get_vpp_instant_usage + * @brief To get the vpp usage + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] vpp_usage The result vpp usage + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_vpp_instant_usage(bm_handle_t handle, int *vpp_usage); +/** + * @name bm_get_last_api_process_time_us + * @brief This function is abandoned. + */ +#ifdef __linux__ +DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle, + unsigned long *time_us); +#else +DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle, + unsigned long long *time_us); +#endif +/*******************tpu clock and module reset related functions *************/ + +/** + * @name bm_set_clk_tpu_freq + * @brief To set the clock frequency of TPU (only valid in PCIE mode). + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] freq The TPU target frequency + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_set_clk_tpu_freq(bm_handle_t handle, int freq); + +/** + * @name bm_get_clk_tpu_freq + * @brief To get the clock frequency of TPU + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] freq The current TPU frequency + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_get_clk_tpu_freq(bm_handle_t handle, int *freq); + +/*******************misc functions ********************************************/ +struct bm_misc_info { + int pcie_soc_mode; /*0---pcie; 1---soc*/ + int ddr_ecc_enable; /*0---disable; 1---enable*/ + long long ddr0a_size; + long long ddr0b_size; + long long ddr1_size; + long long ddr2_size; + unsigned int chipid; +#define BM1682_CHIPID_BIT_MASK (0X1 << 0) +#define BM1684_CHIPID_BIT_MASK (0X1 << 1) +#define BM1686_CHIPID_BIT_MASK (0X1 << 2) +#ifdef __linux__ + unsigned long chipid_bit_mask; +#else + unsigned long long chipid_bit_mask; +#endif + unsigned int driver_version; + int domain_bdf; + int board_version; /*hardware board version [23:16]-mcu sw version, [15:8]-board type, [7:0]-hw version*/ + int a53_enable; + int dyn_enable; +}; + +/** + * @name bm_get_misc_info + * @brief To get miscellaneous information of the device + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] pmisc_info The fetched misc info + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_misc_info(bm_handle_t handle, struct bm_misc_info *pmisc_info); + +/** + * @name bm_get_chipid + * @brief To get the chipid of the device. (0x1682 / 0x1684 / 0x168?) + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] p_chipid The chip id of the device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_get_chipid(bm_handle_t handle, unsigned int *p_chipid); + +#define BMLIB_LOG_QUIET -8 +#define BMLIB_LOG_PANIC 0 +#define BMLIB_LOG_FATAL 8 +#define BMLIB_LOG_ERROR 16 +#define BMLIB_LOG_WARNING 24 +#define BMLIB_LOG_INFO 32 +#define BMLIB_LOG_VERBOSE 40 +#define BMLIB_LOG_DEBUG 48 +#define BMLIB_LOG_TRACE 56 + +/** + * @name bmlib_log_get_level + * @brief To get the bmlib log level + * @ingroup bmlib_log + * + * @param void + * @retval The level of bmlib log level + */ +DECL_EXPORT int bmlib_log_get_level(void); + +/** + * @name bmlib_log_set_level + * @brief To set the bmlib log level + * @ingroup bmlib_log + * + * @param [in] level The level of bmlib log level + * @retval void + */ +DECL_EXPORT void bmlib_log_set_level(int level); + +/** + * @name bmlib_log_set_callback + * @brief To set callback to get bmlib log + * @ingroup bmlib_log + * + * @param [in] callback The callback function to get bmlib log + * @retval void + */ +DECL_EXPORT void bmlib_log_set_callback(void (*callback)(const char*, int, const char*, va_list args)); + +/** + * @name bm_set_debug_mode + * @brief To set the debug mode for firmware log for tpu + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] mode The debug mode of fw log, 0/1 for disable/enable log + * @retval void + */ +DECL_EXPORT void bm_set_debug_mode(bm_handle_t handle, int mode); + +/** + * @name bmlib_api_dbg_callback + * @brief To set debug callback to get firmware log + * @ingroup bmlib_log + * + * @param [in] bmlib_api_dbg_callback callback to get firmware log + * @retval void + */ +typedef void (*bmlib_api_dbg_callback)(int, int, int, const char*); +// api, result, duration, log, third int for api duration for future +DECL_EXPORT void bmlib_set_api_dbg_callback(bmlib_api_dbg_callback callback); + +/** + * @name bmcpu_get_cpu_status + * @brief Get bmcpu status + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @retval BMCPU_RUNNING bmcpu 
is running. + * Other code Fails. + */ +DECL_EXPORT bm_cpu_status_t bmcpu_get_cpu_status(bm_handle_t handle); + +/** + * @name bmcpu_start_cpu + * @brief Start cpu in pcie mode + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] boot_file Fip file + * @param [in] core_file Itb file + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_start_cpu(bm_handle_t handle, char *boot_file, char *core_file); + +/** + * @name bmcpu_open_process + * @brief Open a process to do some work + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] flags Process flags + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval >= 0 process handle + * < 0 Other code Fails. + */ +DECL_EXPORT int bmcpu_open_process(bm_handle_t handle, unsigned int flags, int timeout); + +/** + * @name bmcpu_load_library + * @brief Load a shared library(so) to specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] library_file Library file path + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_load_library(bm_handle_t handle, int process_handle, char *library_file, int timeout); + +/** + * @name bmcpu_unload_library + * @brief Unload a shared library(so) from specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] library_file Library file path + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bmcpu_unload_library(bm_handle_t handle, int process_handle, char *library_file, int timeout); + +/** + * @name bmcpu_exec_function + * @brief Execute specific function in specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] function_name Function name + * @param [in] function_param Function parameters + * @param [in] param_size Parameters size in bytes + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval 0 success. + * >0 code fails from bmlib + * <0 code fails from function + */ +DECL_EXPORT int bmcpu_exec_function(bm_handle_t handle, + int process_handle, + char *function_name, + void *function_param, + unsigned int param_size, + int timeout); + +#define BMCPU_EXEC_OPT_NO_FLUSH_CACHE 1 +/** + * @name bmcpu_exec_function_ext + * @brief Execute specific function in specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] function_name Function name + * @param [in] function_param Function parameters + * @param [in] param_size Parameters size in bytes + * @param [in] opt exec options + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval 0 success. 
+ * >0 code fails from bmlib + * <0 code fails from function + */ +DECL_EXPORT int bmcpu_exec_function_ext(bm_handle_t handle, + int process_handle, + char *function_name, + void *function_param, + unsigned int param_size, + unsigned int opt, + int timeout); + +/** + * @name bmcpu_exec_function_async + * @brief Execute specific function in specific process asynchronous + * user should use bm_query_exec_function_result to query result + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] function_name Function name + * @param [in] function_param Function param + * @param [in] param_size Param size in bytes + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_exec_function_async(bm_handle_t handle, + int process_handle, + char *function_name, + void *function_param, + unsigned int param_size, + unsigned long long *api_handle); + +/** + * @name bmcpu_exec_function_async_ext + * @brief Execute specific function in specific process asynchronous + * user should use bm_query_exec_function_result to query result + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] function_name Function name + * @param [in] function_param Function param + * @param [in] param_size Param size in bytes + * @param [in] opt exec options + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bmcpu_exec_function_async_ext(bm_handle_t handle, + int process_handle, + char *function_name, + void *function_param, + unsigned int param_size, + unsigned int opt, + unsigned long long *api_handle); + +/** + * @name bmcpu_query_exec_function_result + * @brief Query result from function called by bm_exec_function + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] api_handle Api handle return by bm_exec_function_async + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval 0 success. + * >0 code fails from bmlib + * <0 code fails from function + */ +DECL_EXPORT int bmcpu_query_exec_function_result(bm_handle_t handle, unsigned long long api_handle, int timeout); + +/** + * @name bmcpu_map_phys_addr + * @brief Map physical address in specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] phys_addr Physical address + * @param [in] size Map size in bytes + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval >0 virtual address + * 0 fails + */ +DECL_EXPORT void *bmcpu_map_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, unsigned int size, int timeout); + +/** + * @name bmcpu_unmap_phys_addr + * @brief Unmap physical address in specific process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] phys_addr Physical address + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval <0 fail + * 0 success + */ +DECL_EXPORT bm_status_t bmcpu_unmap_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, int timeout); + +/** + * @name bmcpu_close_process + * @brief Close process + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle 
Process handle + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_close_process(bm_handle_t handle, int process_handle, int timeout); + +/** + * @name bmcpu_reset_cpu + * @brief Reset cpu in pcie mode + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_reset_cpu(bm_handle_t handle); + +/** + * @name bm_enable_perf_monitor + * @brief enable perf monitor to get gdma and tpu performance data + * @ingroup bmlib_perf + * + * @param [in] handle The device handle + * @param [in] perf_monitor The monitor to perf + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_enable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor); + +/** + * @name bm_disable_perf_monitor + * @brief disable perf monitor to get gdma and tpu performance data + * @ingroup bmlib_perf + * + * @param [in] handle The device handle + * @param [in] perf_monitor The monitor to perf + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_disable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor); + +/** + * @name bmcpu_set_log + * @brief Set cpu log options + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] log_level 0: DEBUG 1:INFO 2:WARN 3:ERROR 4:FATAL + * @param [in] log_to_console 1: YES 0: No + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bmcpu_set_log(bm_handle_t handle, unsigned int log_level, unsigned int log_to_console, int timeout); + +/** + * @name bmcpu_get_log + * @brief Get cpu log file + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @param [in] process_handle Process handle + * @param [in] log_file save log as file + * @param [in] timeout Timeout value in millisecond, -1 means default value of this device + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_get_log(bm_handle_t handle, int process_handle, char *log_file, int timeout); + +/** + * @name bmcpu_sync_time + * @brief Sync device cpu time with host + * @ingroup bmlib_log + * + * @param [in] handle The device handle + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmcpu_sync_time(bm_handle_t handle); + +/*******************trace and profile related functions **********************/ +struct bm_heap_stat { + unsigned int mem_total; + unsigned int mem_avail; + unsigned int mem_used; +}; + +typedef struct bm_heap_stat_byte { + unsigned int heap_id; + unsigned long long mem_total; + unsigned long long mem_avail; + unsigned long long mem_used; + unsigned long long mem_start_addr; +} bm_heap_stat_byte_t; + +typedef struct bm_dev_stat { + int mem_total; + int mem_used; + int tpu_util; + int heap_num; + struct bm_heap_stat heap_stat[4]; +} bm_dev_stat_t; + +/** + * @name bm_get_stat + * @brief To get the stat data at the moment + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] stat The result stat data + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +DECL_EXPORT bm_status_t bm_get_stat(bm_handle_t handle, bm_dev_stat_t *stat); + +/** + * @name bm_get_gmem_heap_id + * @brief To get the heap id of allocated global memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] pmem The allocated global memory + * @param [out] heapid The result of get heap id + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ + +DECL_EXPORT bm_status_t bm_get_gmem_heap_id(bm_handle_t handle, bm_device_mem_t *pmem, unsigned int *heapid); + +/** + * @name sg_get_gmem_heap_id + * @brief To get the heap id of allocated global memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] pmem The allocated global memory + * @param [out] heapid The result of get heap id + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ + +DECL_EXPORT bm_status_t sg_get_gmem_heap_id(bm_handle_t handle, sg_device_mem_t *pmem, unsigned int *heapid); + +/** + * @name bm_get_gmem_total_heap_num + * @brief To get the total heap num of global memory + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] heap_num The result of get total num + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_gmem_total_heap_num(bm_handle_t handle, unsigned int *heap_num); + +/** + * @name bm_get_gmem_heap_stat_byte_by_id + * @brief To get the heap stat by heap id + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] heap_id The heap index to get heap status + * @param [out] pheap_byte The result of get heap status + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +DECL_EXPORT bm_status_t bm_get_gmem_heap_stat_byte_by_id(bm_handle_t handle, bm_heap_stat_byte_t *pheap_byte, unsigned int heap_id); + +DECL_EXPORT bm_status_t bm_load_firmware( + bm_handle_t handle, + const char *firmware_tcm, + const char *firmware_ddr); + +#define bmkernel_load_firmware okkernel_load_firmware +DECL_EXPORT bm_status_t okkernel_load_firmware( + bm_handle_t handle, + const char *firmware_tcm, + const char *firmware_ddr); + +DECL_EXPORT bm_status_t okkernel_launch_async( + bm_handle_t handle, + const char *func_name, + const void *args, + unsigned int size); + +DECL_EXPORT bm_status_t okkernel_launch_sync( + bm_handle_t handle, + const char *func_name, + const void *args, + unsigned int size); + +DECL_EXPORT bm_status_t tpu_kernel_launch_sync( + bm_handle_t handle, + const char *func_name, + const void *args, + unsigned int size); + +DECL_EXPORT bm_status_t okkernel_sync(bm_handle_t handle); + +/** + * @name bmkernel_launch + * @brief send api to device and launch function + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] api cmd struct pointer + * @param [in] api cmd length + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bmkernel_launch(bm_handle_t handle, const void *args, + unsigned int size); + +/** + * @name bmkernel_load_lookup_table + * @brief load lookup table to l2-sram + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [in] table which loaded to l2-sram + * @param [in] table size + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bmkernel_load_lookup_table(bm_handle_t handle, const void* table, unsigned int size); + +/*******************device management api functions ********************************************/ +/** + * @name bm_get_tpu_current + * @brief get tpu current + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] tpuc(mA) The pointer for tpu current + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_tpu_current(bm_handle_t handle, unsigned int *tpuc); + +/** + * @name bm_get_board_max_power + * @brief get board support max power + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] maxp The pointer for maxp + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_board_max_power(bm_handle_t handle, unsigned int *maxp); + +/** + * @name bm_get_board_power + * @brief get board power + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] boardp The pointer for boardp + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_board_power(bm_handle_t handle, unsigned int *boardp); + +/** + * @name bm_get_fan_speed + * @brief get board fan speed + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] fan The pointer for fan speed + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_fan_speed(bm_handle_t handle, unsigned int *fan); + +/** + * @name bm_get_ecc_correct_num + * @brief get ecc_correct_num + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] ecc_correct_num + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +#ifdef __linux__ +DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long *ecc_correct_num); +#else +DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long long *ecc_correct_num); +#endif +/** + * @name bm_get_12v_atx + * @brief get atx_12v + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] atx_12v + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_12v_atx(bm_handle_t handle, int *atx_12v); + +/** + * @name bm_get_product_sn + * @brief get SE5 sn + * @ingroup device management api + * + * @param [out] product_sn + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_product_sn(char *product_sn); + +/** + * @name bm_get_sn + * @brief get sn + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] sn + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_sn(bm_handle_t handle, char *sn); + +/** + * @name bm_get_status + * @brief get chip status + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] status The board error status, each bit represents an error state + * status == 0x0, board is normal, status > 0, board is abnormal; + * bit0 == 1, tpu is hung + * bit1 == 1, pcie link abnormal + * bit2 == 1, board temperature is too high + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_status(bm_handle_t handle, int *status); + +/** + * @name bm_get_tpu_maxclk + * @brief get tpu_maxclk + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] tpu_maxclk + * @retval BM_SUCCESS Succeeds. + * Other code Fails.
+ */ +DECL_EXPORT bm_status_t bm_get_tpu_maxclk(bm_handle_t handle, unsigned int *tpu_maxclk); + +/** + * @name bm_get_tpu_minclk + * @brief get tpu_minclk + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] tpu_minclk + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_tpu_minclk(bm_handle_t handle, unsigned int *tpu_minclk); + +/** + * @name bm_get_driver_version + * @brief get driver version + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] driver_version + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_driver_version(bm_handle_t handle, int *driver_version); + +/** + * @name bm_get_board_name + * @brief get device board name + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] board_name + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_board_name(bm_handle_t handle, char *name); + +/** + * @name bm_get_board_temp + * @brief get board temperature + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] board_temp + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_board_temp(bm_handle_t handle, unsigned int *board_temp); + +/** + * @name bm_get_chip_temp + * @brief get chip temperature + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] chip_temp + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_chip_temp(bm_handle_t handle, unsigned int *chip_temp); + +/** + * @name bm_get_tpu_power + * @brief get TPU power + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] tpu_power + * @retval BM_SUCCESS Succeeds. + * Other code Fails. 
+ */ +DECL_EXPORT bm_status_t bm_get_tpu_power(bm_handle_t handle, float *tpu_power); + +/** + * @name bm_get_tpu_volt + * @brief get TPU voltage + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] tpu_volt + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_tpu_volt(bm_handle_t handle, unsigned int *tpu_volt); + +/** + * @name bm_get_card_id + * @brief get card id + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] card_id + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_card_id(bm_handle_t handle, unsigned int *card_id); + +/** + * @name bm_get_card_num + * @brief get card number + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] card_num + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_card_num(unsigned int *card_num); + +/** + * @name bm_get_chip_num_from_card + * @brief get chip number and start chip id from card + * @ingroup device management api + * + * @param [in] card_id The card id + * @param [out] chip_num + * @param [out] dev_start_index + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_chip_num_from_card(unsigned int card_id, unsigned int *chip_num, unsigned int *dev_start_index); + +/** + * @name bm_get_dynfreq_status + * @brief get chip dynamic freq status + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [out] dynfreq_status + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_dynfreq_status(bm_handle_t handle, int *dynfreq_status); + +/** + * @name bm_change_dynfreq_status + * @brief change(enable/disable) chip dynamic freq status + * @ingroup device management api + * + * @param [in] handle The device handle + * @param [in] new_status + * @retval BM_SUCCESS Succeeds.
+ * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_change_dynfreq_status(bm_handle_t handle, int new_status); + +/** + * @name bm_get_tpu_scalar_num + * @brief To get the core number of TPU scalar + * @ingroup bmlib_runtime + * + * @param [in] handle The device handle + * @param [out] core_num The core number of TPU scalar + * @retval BM_SUCCESS Succeeds. + * Other code Fails. + */ +DECL_EXPORT bm_status_t bm_get_tpu_scalar_num(bm_handle_t handle, unsigned int *core_num); + +#define bm_get_tpu_core_num bm_get_tpu_scalar_num + +#if defined(__cplusplus) +} +#endif + +#endif /* BM_RUNTIME_H_ */ diff --git a/models/Baichuan2/src/include/bmruntime_interface.h b/models/Baichuan2/src/include/bmruntime_interface.h new file mode 100644 index 0000000..cbf6964 --- /dev/null +++ b/models/Baichuan2/src/include/bmruntime_interface.h @@ -0,0 +1,404 @@ +/***************************************************************************** + * + * Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved. + * + * The material in this file is confidential and contains trade secrets + * of Sophgo Technologies Inc. This is proprietary information owned by + * Sophgo Technologies Inc. No part of this work may be disclosed, + * reproduced, copied, transmitted, or used in any way for any purpose, + * without the express written permission of Sophgo Technologies Inc. + * + *****************************************************************************/ + +/***************************************************************************** + * BMRuntime Interface is mainly for inference. + * Also we can use it for device computation from BMLang programming. + * Note: please use interface from bmlib_runtime.h for device memory operation. 
+ ****************************************************************************/ + +#ifndef BMRUNTIME_INTERFACE_H_ +#define BMRUNTIME_INTERFACE_H_ + +#include "bmdef.h" + +#ifdef _WIN32 +#define DECL_EXPORT _declspec(dllexport) +#define DECL_IMPORT _declspec(dllimport) +#else +#define DECL_EXPORT +#define DECL_IMPORT +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* --------------------------------------------------------------------------*/ +/* interface for basic data type */ + +/* get data type byte size */ +DECL_EXPORT size_t bmrt_data_type_size(bm_data_type_t dtype); + +/* +dims array to bm_shape_t, +shape and dims should not be NULL, num_dims should not be larger than BM_MAX_DIMS_NUM */ +DECL_EXPORT void bmrt_shape(bm_shape_t* shape, const int* dims, int num_dims); + +/* +number of shape elements, shape should not be NULL and num_dims should not large than +BM_MAX_DIMS_NUM */ +DECL_EXPORT uint64_t bmrt_shape_count(const bm_shape_t* shape); + +/* compare whether two shape is same */ +DECL_EXPORT bool bmrt_shape_is_same(const bm_shape_t* left, const bm_shape_t* right); + +/* +fill a tensor with data type and shape, and st_mode = 0 as default. +tensor and p_bmrt should not be NULL, shape count should not be 0. +it will alloc device mem to tensor->device_mem, so user should bmrt_free_device(p_bmrt, +tensor->device_mem) to free it.*/ +DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape); + +/* +fill a tensor with data type and shape, and st_mode = 0 as default. +tensor and p_bmrt should not be NULL, shape count should not be 0. 
+it will alloc device mem to tensor->device_mem on devid-th device.*/ +DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape); + +/* fill a tensor with device mem existed, tensor byte size should not large than device mem size */ +DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem, + bm_data_type_t dtype, bm_shape_t shape); + +/* get tensor bytes size, tensor should not be NULL */ +DECL_EXPORT size_t bmrt_tensor_bytesize(const bm_tensor_t* tensor); + +/* get tensor mem size allocated in device mem, tensor should not be NULL */ +DECL_EXPORT size_t bmrt_tensor_device_size(const bm_tensor_t* tensor); + +/* print net info for debug */ +DECL_EXPORT void bmrt_print_network_info(const bm_net_info_t* net_info); + +/* --------------------------------------------------------------------------*/ +/** + * @name bmrt_create + * @brief To create the bmruntime with bm_handle. + * @ingroup bmruntime + * + * This API creates the bmruntime. It returns a void* pointer which is the pointer + * of bmruntime. Device id is set when get bm_handle; + * + * @param [in] bm_handle bm handle. It must be initialized by using bmlib. + * + * @retval void* the pointer of bmruntime + */ +DECL_EXPORT void* bmrt_create(bm_handle_t bm_handle); + +/* --------------------------------------------------------------------------*/ +/** + * @name bmrt_create_ex + * @brief To create the bmruntime with one or more bm_handle. + * @ingroup bmruntime + * + * This API creates the bmruntime. It returns a void* pointer which is the pointer + * of bmruntime. + * + * @param [in] bm_handles bm handles. They must be initialized by using bmlib. + * @param [in] num_handles number of bm_handles. 
+ * + * @retval void* the pointer of bmruntime + */ +DECL_EXPORT void *bmrt_create_ex(bm_handle_t *bm_handles, int num_handles); + +/** + * @name bmrt_destroy + * @brief To destroy the bmruntime pointer + * @ingroup bmruntime + * + * This API destroy the bmruntime. + * + * @param [in] p_bmrt Bmruntime that had been created + */ +DECL_EXPORT void bmrt_destroy(void* p_bmrt); + +/** + * @name bmrt_get_bm_handle + * @brief To get the BM runtime context. + * @ingroup bmruntime + * + * This API get the BM runtime context for using BMDNN, BMCV or BMLIB + * + * @param [in] p_bmrt Bmruntime that had been created + */ +DECL_EXPORT void * bmrt_get_bm_handle(void* p_bmrt); + +/** + * @name bmrt_load_bmodel + * @brief To load the bmodel which is created by BM compiler + * @ingroup bmruntime + * + * This API is to load bmodel created by BM compiler. + * After loading bmodel, we can run the inference of neuron network. + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] bmodel_path Bmodel file directory. + * + * @retval true Load context sucess. + * @retval false Load context failed. + */ +DECL_EXPORT bool bmrt_load_bmodel(void* p_bmrt, const char *bmodel_path); + +/** + * @name bmrt_load_bmodel_data + * @brief To load the bmodel which is created by BM compiler from buffer + * @ingroup bmruntime + * + * This API is to load bmodel created by BM compiler. + * After loading bmodel, we can run the inference of neuron network. + * Different with bmrt_load_bmodel, bmodel is the data in host memory. + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] bmodel_data Bmodel data pointer to buffer + * @param [in] size Bmodel data size + * + * @retval true Load context sucess. + * @retval false Load context failed. 
+ */ +DECL_EXPORT bool bmrt_load_bmodel_data(void* p_bmrt, const void * bmodel_data, size_t size); + +/** + * @name bmrt_show_neuron_network + * @brief To print the name of all neuron network + * @ingroup bmruntime + * + * @param [in] p_bmrt Bmruntime that had been created + */ +DECL_EXPORT void bmrt_show_neuron_network(void* p_bmrt); + +/** + * @name bmrt_get_network_number + * @brief To get the number of neuron network in the bmruntime + * @ingroup bmruntime + * + * @param [in] p_bmrt Bmruntime that had been created + * + * @retval int value The number of neuron networks. + */ +DECL_EXPORT int bmrt_get_network_number(void* p_bmrt); + +/** + * @name bmrt_get_network_names + * @brief To get the names of all neuron network in the bmruntime + * @ingroup bmruntime + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [out] network_names The names of all neuron networks. It should be declare as (const char** networks_ = NULL), + * and use as the param &networks_. After this API, user need to free(networks_) if user + * do not need it. + */ +DECL_EXPORT void bmrt_get_network_names(void* p_bmrt, const char*** network_names); + +/** + * @name bmrt_get_network_info + * @brief To get network info by net name + * @ingroup bmruntime + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] net_name Network name + * + * @retval bm_net_info_t* Pointer to net info, needn't free by user; if net name not found, will return NULL. + */ +DECL_EXPORT const bm_net_info_t* bmrt_get_network_info(void* p_bmrt, const char* net_name); + +/** + * @name bmrt_launch_tensor + * @brief To launch the inference of the neuron network with setting input tensors + * @ingroup bmruntime + * + * This API supports the neuron nework that is static-compiled or dynamic-compiled + * After calling this API, inference on TPU is launched. And the CPU program will not + * be blocked. bm_thread_sync should be called to make sure inference finished. 
+ * This API support multiple inputs, and multi thread safety + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] net_name The name of the neuron network + * @param [in] input_tensors Array of input tensor, defined like bm_tensor_t input_tensors[input_num]. + * User should initialize each input tensor. + * @param [in] input_num Input number + * @param [out] output_tensors Array of output tensor, defined like bm_tensor_t output_tensors[output_num]. + * This interface will alloc devcie mem to store output data. User should free each + * device mem by bm_free_device after the result data not used. + * @param [in] output_num Output number + * + * @retval true Launch success. + * @retval false Launch failed. + */ +DECL_EXPORT bool bmrt_launch_tensor(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num, + bm_tensor_t output_tensors[], int output_num); + +/** + * @name bmrt_launch_tensor_ex + * @brief To launch the inference of the neuron network with setting input tensors + * @ingroup bmruntime + * + * This API supports the neuron nework that is static-compiled or dynamic-compiled + * After calling this API, inference on TPU is launched. And the CPU program will not + * be blocked. bm_thread_sync should be called to make sure inference finished. + * This API support multiple inputs, and multi thread safety + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] net_name The name of the neuron network + * @param [in] input_tensors Array of input tensor, defined like bm_tensor_t input_tensors[input_num], + * User should initialize each input tensor. + * @param [in] input_num Input number + * @param [out] output_tensors Array of output tensor, defined like bm_tensor_t output_tensors[output_num]. + * User can set device_mem or stmode of output tensors. 
If user_mem is true, this interface + * will use device mem of output_tensors to store output data, and not alloc device mem; + * Or it will alloc device mem to store output. If user_stmode is true, it will use stmode in + * each output tensor; Or stmode will be BM_STORE_1N as default. + * @param [in] output_num Output number + * @param [in] user_mem whether device_mem of output tensors are set + * @param [in] user_stmode whether stmode of output tensors are set + * + * @retval true Launch success. + * @retval false Launch failed. + */ +DECL_EXPORT bool bmrt_launch_tensor_ex(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num, + bm_tensor_t output_tensors[], int output_num, bool user_mem, bool user_stmode); + +/** + * @name bmrt_launch_data + * @brief To launch the inference of the neuron network with setting input datas in system memory + * @ingroup bmruntime + * + * This API supports the neuron nework that is static-compiled or dynamic-compiled + * After calling this API, inference on TPU is launched. And the CPU + * program will be blocked. + * This API support multiple inputs, and multi thread safety + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] net_name The name of the neuron network + * @param [in] input_datas Array of input data, defined like void * input_datas[input_num]. User should + * initialize each data pointer as input. + * @param [in] input_shapes Array of input shape, defined like bm_shape_t input_shapes[input_num]. + * User should set each input shape + * @param [in] input_num Input number + * @param [out] output_datas Array of output data, defined like void * output_datas[output_num]. + * If user don't alloc each output data, set user_mem to false, and this api will alloc + * output mem, user should free each output mem when output data not used. Also + * user can alloc system memory for each output data by self and set user_mem = true. 
+ * @param [out] output_shapes Array of output shape, defined like bm_shape_t output_shapes[output_num]. + * It will store each output shape. + * @param [in] output_num Output number + * @param [in] user_mem whether output_datas[i] have allocated memory + * + * @retval true Launch success. + * @retval false Launch failed. + */ +DECL_EXPORT bool bmrt_launch_data(void* p_bmrt, const char* net_name, void* const input_datas[], + const bm_shape_t input_shapes[], int input_num, void * output_datas[], + bm_shape_t output_shapes[], int output_num, bool user_mem); + +/** + * @name bmrt_trace + * @brief To check runtime environment, and collect info for DEBUG + * @ingroup bmruntime + * + * This API is to collect runtime info for DEBUG. Expecially when launch result sudden mistake, call bmrt_trace + * will show whether device mems are broken, and other check info. + * + * @param [in] p_bmrt Bmruntime that had been created + */ +DECL_EXPORT void bmrt_trace(void* p_bmrt); + +/** + * @name bmrt_launch_tensor_multi_cores + * @brief To launch the inference of the neuron network with setting input tensors, and support multi core inference. + * @ingroup bmruntime + * + * This API supports the neuron nework that is static-compiled or dynamic-compiled + * After calling this API, inference on TPU is launched. And the CPU program will not + * be blocked. bm_thread_sync_from_core should be called to make sure inference is finished. + * This API support multiple inputs, and multi thread safety + * + * @param [in] p_bmrt Bmruntime that had been created + * @param [in] net_name The name of the neuron network + * @param [in] input_tensors Array of input tensor, defined like bm_tensor_t input_tensors[input_num], + * User should initialize each input tensor. + * @param [in] input_num Input number + * @param [out] output_tensors Array of output tensor, defined like bm_tensor_t output_tensors[output_num]. + * User can set device_mem or stmode of output tensors. 
If user_mem is true, this interface + * will use device mem of output_tensors to store output data, and not alloc device mem; + * Or it will alloc device mem to store output. If user_stmode is true, it will use stmode in + * each output tensor; Or stmode will be BM_STORE_1N as default. + * @param [in] output_num Output number + * @param [in] user_mem whether device_mem of output tensors are set + * @param [in] user_stmode whether stmode of output tensors are set + * @param [in] core_list list of core ids that will be used for inference + * @param [in] core_num number of entries in core_list + * + * @retval true Launch success. + * @retval false Launch failed. + */ +DECL_EXPORT bool bmrt_launch_tensor_multi_cores( + void *p_bmrt, + const char *net_name, + const bm_tensor_t input_tensors[], + int input_num, + bm_tensor_t output_tensors[], + int output_num, + bool user_mem, + bool user_stmode, + const int *core_list, + int core_num); + +/** + * @name bmrt_memcpy_s2d_parallel + * @brief To copy data from system memory to multi-device memory in parallel + * @ingroup bmruntime + * + * This API can only be used when the p_bmrt is created with bmrt_create_ex on multi devices. + * After calling this API, datas[:tensor_num[0]] will be copied to the first device, and + * datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device and so on. + * The process of copying data to different devices is done in parallel and to the same device is in sequence. 
+ * + * @param [in] p_bmrt Bmruntime that had been created with multi bm_handles + * @param [in] tensors Array of tensors that will be copied to devices + * @param [in] datas Array of data buffers allocated in system memory + * @param [in] tensor_num Array of tensor_num that will be copied to each device + * @param [in] device_num Device number +*/ +DECL_EXPORT bool bmrt_memcpy_s2d_parallel( + void *p_bmrt, + bm_tensor_t tensors[], + void *datas[], + int tensor_num[], + int device_num); + +/** + * @name bmrt_memcpy_d2s_parallel + * @brief To copy data from multi-device memory to system memory in parallel + * @ingroup bmruntime + * + * This API can only be used when the p_bmrt is created with bmrt_create_ex on multi devices. + * After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]], and + * tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] and so on. + * The process of copying data from different devices is done in parallel and from the same device is in sequence. + * + * @param [in] p_bmrt Bmruntime that had been created with multi bm_handles + * @param [out] datas Array of data buffers allocated in system memory + * @param [in] tensors Array of tensors that will be copied from devices + * @param [in] tensor_num Array of tensor_num that will be copied from each device + * @param [in] device_num Device number +*/ +DECL_EXPORT bool bmrt_memcpy_d2s_parallel( + void *p_bmrt, + void *datas[], + bm_tensor_t tensors[], + int tensor_num[], + int device_num); + +#if defined (__cplusplus) +} +#endif + +#endif diff --git a/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h b/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h new file mode 100644 index 0000000..14b1e8c --- /dev/null +++ b/models/Baichuan2/src/include/sentencepiece/sentencepiece_processor.h @@ -0,0 +1,727 @@ +// Copyright 2016 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +#ifndef SENTENCEPIECE_PROCESSOR_H_ +#define SENTENCEPIECE_PROCESSOR_H_ + +#include +#include +#include +#include +#include +#include + +#ifndef SWIG +namespace absl { +using std::string_view; +} // namespace absl +#endif // SWIG + +namespace sentencepiece { +namespace util { + +enum class StatusCode : int { + kOk = 0, + kCancelled = 1, + kUnknown = 2, + kInvalidArgument = 3, + kDeadlineExceeded = 4, + kNotFound = 5, + kAlreadyExists = 6, + kPermissionDenied = 7, + kResourceExhausted = 8, + kFailedPrecondition = 9, + kAborted = 10, + kOutOfRange = 11, + kUnimplemented = 12, + kInternal = 13, + kUnavailable = 14, + kDataLoss = 15, + kUnauthenticated = 16, +}; + +class Status { + public: + Status(); + ~Status(); + Status(StatusCode code, absl::string_view error_message); + Status(const Status &s); + void operator=(const Status &s); + bool operator==(const Status &s) const; + bool operator!=(const Status &s) const; + inline bool ok() const { return rep_ == nullptr; } + + void set_error_message(const char *str); + const char *error_message() const; + const char *message() const { return error_message(); } + StatusCode code() const; + std::string ToString() const; + + void IgnoreError(); + + private: + struct Rep; + std::unique_ptr rep_; +}; +} // namespace util + +// SentencePieceProcessor: +// Simple and language independent tokenizer and de-tokenizer for +// Neural Network Machine Translation. 
+// +// SentencePieceProcessor provides Encode() and Decode() methods, +// which correspond to tokenization and de-tokenization respectively. +// +// - Encode: +// Given a raw source sentence, encode it into a sequence +// of pieces or vocabulary ids. +// +// - Decode: +// Given a sequence of pieces or vocabulary ids, decode it +// into a de-tokenized raw sentence. +// +// SentencePieceProcessor provides a lossless data conversion +// that allows the original raw sentence to be perfectly reconstructed +// from the encoded data, i.e., Decode(Encode(input)) == input. +// This characteristics is useful, as we can make the de-tokenization +// completely language independent. +// +// Usage: +// SentencePieceProcessor sp; +// sp.Load("//path/to/model"); +// +// vector sps; +// sp.Encode("hello world.", &sps).IgnoreError(); +// +// vector ids; +// sp.Encode("hello world.", &ids).IgnoreError(); +// +// string detok; +// sp.Decode(sps, &detok); +// CHECK_EQ("hello world.", detok).IgnoreError(); +// +// sp.Decode(ids, &detok); +// CHECK_EQ("hello world.", detok).IgnoreError(); +// +// We can also use SentencePieceText which manages the byte-offsets +// between user input (output) and internal sentence pieces. +// +// SentencePieceText spt; +// sp.Encode("hello world.", &spt); +// // Emits the byte range of each piece. +// for (const auto &piece : spt.pieces()) { +// LOG(INFO) << piece.begin() << " " << piece.end(); +// } +// +// sp.Decode({0, 1, 2, 3..}, &spt); +// for (const auto &piece : spt.pieces()) { +// LOG(INFO) << piece.begin() << " " << piece.end(); +// } +// + +class NBestSentencePieceText; +class ModelInterface; +class SentencePieceText; +class ModelProto; + +namespace normalizer { +class Normalizer; +} // namespace normalizer + +#ifndef SWIGGO +namespace util { +// Redefine std::string for serialized_proto interface as Python's string is +// a Unicode string. We can enforce the return value to be raw byte sequence +// with SWIG's typemap. 
+using bytes = std::string; +} // namespace util +#endif // SWIGGO + +class NBestSentencePieceText; +class ModelInterface; +class SentencePieceText; +class SentencePieceText_SentencePiece; + +// Wrapper class of SentencePieceText +// This wrapper only allows an immutable access to the proto and +// hides the actual implementation of protobuf. +// See sentencepiece.proto for the details of this class. +class ImmutableSentencePieceText_ImmutableSentencePiece { + public: + ImmutableSentencePieceText_ImmutableSentencePiece(); + ~ImmutableSentencePieceText_ImmutableSentencePiece() = default; + + const std::string &piece() const; + const std::string &surface() const; + uint32_t id() const; + uint32_t begin() const; + uint32_t end() const; + + friend class ImmutableSentencePieceText; + + private: + explicit ImmutableSentencePieceText_ImmutableSentencePiece( + const SentencePieceText_SentencePiece &sp); + const SentencePieceText_SentencePiece *sp_ = nullptr; +}; + +class ImmutableSentencePieceText { + public: + ImmutableSentencePieceText(); + virtual ~ImmutableSentencePieceText(); + + std::vector pieces() const; + + size_t pieces_size() const; + ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const; + + const std::string &text() const; + float score() const; + + util::bytes SerializeAsString() const; + + // Returns the actual mutable proto. + // Do not use this outside of SentencePieceProcessor, as + // it returns the raw pointer managed by the shared_ptr. + SentencePieceText *mutable_proto(); + + // Converts the utf8 byte spans into Unicode char span. + void ConvertToUnicodeSpans(); + + friend class ImmutableNBestSentencePieceText; + + private: + explicit ImmutableSentencePieceText(const SentencePieceText &spt); + const SentencePieceText *spt_ = nullptr; + std::shared_ptr rep_; +}; + +// Wrapper class of SentencePieceText +// This wrapper only allows an immutable access to the proto and +// hides the actual implementation of protobuf. 
+// See sentencepiece.proto for the details of this class. +class ImmutableNBestSentencePieceText { + public: + ImmutableNBestSentencePieceText(); + virtual ~ImmutableNBestSentencePieceText(); + + std::vector nbests() const; + + size_t nbests_size() const; + ImmutableSentencePieceText nbests(int index) const; + + util::bytes SerializeAsString() const; + + // Returns the actual mutable proto. + // Do not use this outside of SentencePieceProcessor, as + // it returns the raw pointer managed by the shared_ptr. + NBestSentencePieceText *mutable_proto(); + + void ConvertToUnicodeSpans(); + + private: + std::shared_ptr rep_; +}; + +class SentencePieceProcessor { + public: + SentencePieceProcessor(); + virtual ~SentencePieceProcessor(); + + // Loads model from `filename`. + // Returns false if `filename` cannot be loaded. + virtual util::Status Load(absl::string_view filename); + + // Loads model from `filename`. + // Crash if `filename` cannot be loaded. + virtual void LoadOrDie(absl::string_view filename); + + // Loads model from `model_proto`. + // `model_proto` is copied. + virtual util::Status Load(const ModelProto &model_proto); + + // Loads model from `model_proto`. + // `model_proto` is moved. + virtual util::Status Load(std::unique_ptr model_proto); + + // Loads model from `serialized`, which is a string-serialized model proto. + // Useful to load the model from a platform independent blob object. + virtual util::Status LoadFromSerializedProto(absl::string_view serialized); + + // Returns the status. Encode/Decode methods are valid when status is OK. + virtual util::Status status() const; + + // Sets encode extra_option sequence. + virtual util::Status SetEncodeExtraOptions(absl::string_view extra_option); + + // Sets decode extra_option sequence. + virtual util::Status SetDecodeExtraOptions(absl::string_view extra_option); + + ////////////////////////////////////////////////////////////// + // Vocabulary restriction. 
+ // Background: + // https://github.com/rsennrich/subword-nmt#best-practice-advice-for-byte-pair-encoding-in-nmt + + // Restricts the vocabulary set. + // The input sentences are encoded into the tokens in `valid_vocab`. + virtual util::Status SetVocabulary( + const std::vector &valid_vocab); + + // Reverts the vocabulary restriction. + virtual util::Status ResetVocabulary(); + + // Loads the valid vocabulary set from `filename` in TSV format. + // Format: . + // Any token with frequency < threshold will be treated as OOV. + virtual util::Status LoadVocabulary(absl::string_view filename, + int threshold); + + ////////////////////////////////////////////////////////////// + // Simple Encode and Decode API. + // + // Given a UTF8 input, encodes it into a sequence of sentence pieces. + virtual util::Status Encode(absl::string_view input, + std::vector *pieces) const; + + // Given a UTF8 input, encodes it into a sequence of ids. + virtual util::Status Encode(absl::string_view input, + std::vector *ids) const; + + // Given a sequence of pieces, decodes it into a detokenized output. + virtual util::Status Decode(const std::vector &pieces, + std::string *detokenized) const; + + // Given a sequence of pieces, decodes it into a detokenized output. + virtual util::Status Decode(const std::vector &pieces, + std::string *detokenized) const; + + // Given a sequence of ids, decodes it into a detokenized output. + virtual util::Status Decode(const std::vector &ids, + std::string *detokenized) const; + + ////////////////////////////////////////////////////////////// + // NBest API. + // + // Same as Encode, but returns nbest results. + virtual util::Status NBestEncode( + absl::string_view input, int nbest_size, + std::vector> *pieces) const; + + // Same as Encode, but returns nbest results. + virtual util::Status NBestEncode(absl::string_view input, int nbest_size, + std::vector> *ids) const; + + ////////////////////////////////////////////////////////////// + // Sampling API. 
+ // + // Unigram and BPE support sampling mode. + // - Unigram (--model_type=unigram): + // `nbest_size`: When `nbest_size` is positive value, approximately samples + // one segmentation from nbest candidates. When `nbest_size` is negative + // value, samples one segmentation from the hypotheses (Lattice) according to + // the generation probabilities using forward-filtering and backward-sampling + // algorithm. + // `alpha`: Smoothing parameter (inverse temperature). The best segmentation + // (Viterbi segmentation) is more likely sampled when setting larger alpha. + // When alpha is 0.0, one segmentation is uniformly sampled from the nbest or + // lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha` + // in https://arxiv.org/abs/1804.10959 (nbest_size < 0 means l = infinity) + // + // - BPE (--model_type=bpe): + // `alpha`: The dropout probability `p` of bpe merge operations in + // https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so + // nbest_size parameter is ignored in BPE. + virtual util::Status SampleEncode(absl::string_view input, int nbest_size, + float alpha, + std::vector *pieces) const; + + // Same as above, but returns a sequence of ids. + virtual util::Status SampleEncode(absl::string_view input, int nbest_size, + float alpha, std::vector *ids) const; + + ////////////////////////////////////////////////////////////// + // SampleEncodeAndScore API. + // + // Sample `samples` many tokenisations from the segmentation lattice. + // These methods are only available in model_type=unigram. + // + // `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in + // `Sample` method. 
+ // 'wor`: If `wor` is true, the samples are taken without replacement, and the + // scores are the inclusion probabilities of the elements in the sample; + // otherwise the samples are taken with replacement and the scores are the + // log-probs of sample elements + // `include_best`: If `include_best` is true, the best tokenisation is always + // included in the sample, and the remaining elements are sampled excluding + // the best. + virtual util::Status SampleEncodeAndScore( + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *pieces) const; + + // Same as above, but returns a sequence of ids. + virtual util::Status SampleEncodeAndScore( + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, + std::vector, float>> *ids) const; + + ////////////////////////////////////////////////////////////// + // Entropy API. + // + // This only available in model_type=unigram. + // Calculate entropy of possible tokenisations + virtual util::Status CalculateEntropy(absl::string_view input, float alpha, + float *entropy) const; + + ////////////////////////////////////////////////////////////// + // Advanced API returning SentencePieceText, which manages + // utf8-byte alignments between user-input/detokenized text + // and internal sentencepiece sequence. + // + // Given a UTF8 input, encodes it into SentencePieceText. + // + // When using these APIs, sentencepiece.pb.h header files must be included. + // We can also use ImutableSentencePieceText as follows. 
+ // + // ImmutableSentencePieceText spt; + // Encode("hello", spt.mutable_proto()).IgnoreError(); + // std::cout << spt.pieces_size() << std::endl; + virtual util::Status Encode(absl::string_view input, + SentencePieceText *spt) const; + + virtual util::Status NBestEncode(absl::string_view input, int nbest_size, + NBestSentencePieceText *nbest_spt) const; + + virtual util::Status SampleEncode(absl::string_view input, int nbest_size, + float alpha, SentencePieceText *spt) const; + + virtual util::Status SampleEncodeAndScore( + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best, NBestSentencePieceText *samples_spt) const; + + // DEPRECATED: Remove this API and use std::vector + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + + virtual util::Status Decode(const std::vector &pieces, + SentencePieceText *spt) const; + + virtual util::Status Decode(const std::vector &ids, + SentencePieceText *spt) const; +#ifdef SWIG +#define SPP_SWIG_CHECK_AND_THROW \ + if (!status.ok()) throw status; +#else +#define SPP_SWIG_CHECK_AND_THROW \ + if (!status.ok()) { \ + } +#endif // SWIG + +#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, &output); \ + SPP_SWIG_CHECK_AND_THROW; \ + return output; + +#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ + SPP_SWIG_CHECK_AND_THROW; \ + return output.SerializeAsString(); + +#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \ + OutType output; \ + const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \ + SPP_SWIG_CHECK_AND_THROW; \ + return output; + + ////////////////////////////////////////////////////////////// + // Handy methods that return the result directly. + // These functions ignore internal errors. 
+ virtual std::vector EncodeAsPieces( + absl::string_view input) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector, input); + } + + virtual std::vector EncodeAsIds(absl::string_view input) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector, input); + } + + virtual std::vector> NBestEncodeAsPieces( + absl::string_view input, int nbest_size) const { + DEFINE_SPP_DIRECT_FUNC_IMPL( + NBestEncode, std::vector>, input, nbest_size); + } + + virtual std::vector> NBestEncodeAsIds( + absl::string_view input, int nbest_size) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(NBestEncode, std::vector>, + input, nbest_size); + } + + virtual std::vector SampleEncodeAsPieces(absl::string_view input, + int nbest_size, + float alpha) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector, input, + nbest_size, alpha); + } + + virtual std::vector SampleEncodeAsIds(absl::string_view input, + int nbest_size, + float alpha) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector, input, + nbest_size, alpha); + } + + virtual std::vector, float>> + SampleEncodeAndScoreAsPieces(absl::string_view input, int num_samples, + float alpha, bool wor, bool include_best) const { + using _T = std::vector, float>>; + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples, + alpha, wor, include_best); + } + + virtual std::vector, float>> + SampleEncodeAndScoreAsIds(absl::string_view input, int num_samples, + float alpha, bool wor, bool include_best) const { + using _T = std::vector, float>>; + DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples, + alpha, wor, include_best); + } + + // DEPRECATED: Remove this API and use std::vector + virtual std::string DecodePieces( + const std::vector &pieces) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); + } + + virtual std::string DecodePieces( + const std::vector &pieces) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces); + } + + virtual std::string DecodeIds(const 
std::vector &ids) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids); + } + + virtual float CalculateEntropy(absl::string_view text, float alpha) const { + DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, alpha); + } + + ////////////////////////////////////////////////////////////// + // SerializedProto API. (DEPRECATED). Use ImmutableProto API. + // They are used in Python interface. Returns serialized proto. + // In python module, we can get access to the full Proto after + // deserialzing the returned byte sequence. + virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(Encode, ImmutableSentencePieceText, input); + } + + virtual util::bytes SampleEncodeAsSerializedProto(absl::string_view input, + int nbest_size, + float alpha) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText, + input, nbest_size, alpha); + } + + virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input, + int nbest_size) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL( + NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size); + } + + virtual util::bytes SampleEncodeAndScoreAsSerializedProto( + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore, + ImmutableNBestSentencePieceText, input, + num_samples, alpha, wor, include_best); + } + + // TODO(taku): Remove this API and use std::vector + virtual util::bytes DecodePiecesAsSerializedProto( + const std::vector &pieces) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, + pieces); + } + + virtual util::bytes DecodePiecesAsSerializedProto( + const std::vector &pieces) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, + pieces); + } + + virtual util::bytes DecodeIdsAsSerializedProto( + const std::vector &ids) const { + DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, 
ImmutableSentencePieceText, ids); + } + + ////////////////////////////////////////////////////////////// + // ImmutableProto API. + virtual ImmutableSentencePieceText EncodeAsImmutableProto( + absl::string_view input) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Encode, ImmutableSentencePieceText, input); + } + + virtual ImmutableSentencePieceText SampleEncodeAsImmutableProto( + absl::string_view input, int nbest_size, float alpha) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText, + input, nbest_size, alpha); + } + + virtual ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto( + absl::string_view input, int nbest_size) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL( + NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size); + } + + virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto( + absl::string_view input, int num_samples, float alpha, bool wor, + bool include_best) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore, + ImmutableNBestSentencePieceText, input, + num_samples, alpha, wor, include_best); + } + + // TODO(taku): Remove this API and use std::vector + virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto( + const std::vector &pieces) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces); + } + + virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto( + const std::vector &pieces) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces); + } + + virtual ImmutableSentencePieceText DecodeIdsAsImmutableProto( + const std::vector &ids) const { + DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids); + } + +#undef DEFINE_SPP_DIRECT_FUNC_IMPL +#undef DEFINE_SPP_SERIALIZED_PROTO_IMPL +#undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL + + ////////////////////////////////////////////////////////////// + // Vocabulary management methods. 
+ // + // Returns the size of sentence pieces, which is the same as + // the size of vocabulary for NMT. + virtual int GetPieceSize() const; + + // Returns the vocab id of `piece`. + // Returns UNK(0) if `piece` is unknown. + virtual int PieceToId(absl::string_view piece) const; + + // Returns the string representation of vocab with `id`. + virtual const std::string &IdToPiece(int id) const; + + // Returns the score of `id`. + // Usually score is an emission log probability of unigram language + // model. + virtual float GetScore(int id) const; + + // Returns true if `id` is unknown symbol. + virtual bool IsUnknown(int id) const; + + // Returns true if `id` is control symbol. + virtual bool IsControl(int id) const; + + // Returns true if `id` is unused symbol. + virtual bool IsUnused(int id) const; + + // Returns true if `id` is byte symbol. + virtual bool IsByte(int id) const; + + // Returns the reserved id. + // Returns -1 if not defined. + + // Returns unknown () id. + virtual int unk_id() const; + + // Returns BOS () id. + virtual int bos_id() const; + + // Returns EOS () id. + virtual int eos_id() const; + + // Returns PAD () id. + virtual int pad_id() const; + + ////////////////////////////////////////////////////////////// + // Model management. + // + // Allows injection of a mock model instance. `model` is moved. + void SetModel(std::unique_ptr &&model); + + // Allows injection of a normalizer instance. `normalizer` is moved. + void SetNormalizer(std::unique_ptr &&normalizer); + + // Returns immutable model proto. Useful to obtain extended + // or experimental parameters encoded in model_proto. + const ModelProto &model_proto() const; + + // returns immutable model proto as std::string. + // Useful to save the state of this instance via Python's pickle object. 
+ util::bytes serialized_model_proto() const; + + private: + enum ExtraOption { REVERSE, BOS, EOS, UNK_PIECE }; + + util::Status ParseExtraOptions(absl::string_view extra_option, + std::vector *extra_options) const; + + util::Status ApplyExtraOptions(const std::vector &extra_options, + SentencePieceText *spt) const; + + util::Status PopulateSentencePieceText( + absl::string_view input, absl::string_view normalized, + const std::vector &norm_to_orig, + const std::vector> &result, + SentencePieceText *spt) const; + + std::unique_ptr model_; + std::unique_ptr normalizer_; + std::unique_ptr denormalizer_; + + // Underlying model protocol buffer. The same lifetime as model_. + std::unique_ptr model_proto_; + + std::vector encode_extra_options_; + std::vector decode_extra_options_; +}; + +// Set seed value of random generator. +// Do not set static_cast(-1), +// as this seed is reserved for initializing from +// std::random_device. +void SetRandomGeneratorSeed(unsigned int seed); + +// IO related functions to absorb model formats. +namespace io { +// Loads `model_proto` from `filename`. +// We can instantiate SentencePieceProcessor as follows: +// +// auto model_proto = absl::make_unique(); +// io::LoadModelProto("//path/spm.model", model_proto.get()); +// SentencePieceProcessor sp; +// CHECK_OK(sp.Load(std::move(model_proto))); +util::Status LoadModelProto(absl::string_view, ModelProto *model_proto); + +// Saves `model_proto` as `filename`. 
+util::Status SaveModelProto(absl::string_view, const ModelProto &model_proto); +} // namespace io +} // namespace sentencepiece +#endif // SENTENCEPIECE_PROCESSOR_H_ diff --git a/models/Baichuan2/src/lib_pcie/libbmlib.so b/models/Baichuan2/src/lib_pcie/libbmlib.so new file mode 100644 index 0000000..7f9a95f Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmlib.so differ diff --git a/models/Baichuan2/src/lib_pcie/libbmrt.so b/models/Baichuan2/src/lib_pcie/libbmrt.so new file mode 100644 index 0000000..137929f Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmrt.so differ diff --git a/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0 b/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0 new file mode 100644 index 0000000..137929f Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libbmrt.so.1.0 differ diff --git a/models/Baichuan2/src/lib_pcie/libsentencepiece.a b/models/Baichuan2/src/lib_pcie/libsentencepiece.a new file mode 100644 index 0000000..7c17fa2 Binary files /dev/null and b/models/Baichuan2/src/lib_pcie/libsentencepiece.a differ diff --git a/models/Baichuan2/src/lib_soc/libbmlib.so b/models/Baichuan2/src/lib_soc/libbmlib.so new file mode 100644 index 0000000..81c75c1 Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmlib.so differ diff --git a/models/Baichuan2/src/lib_soc/libbmrt.so b/models/Baichuan2/src/lib_soc/libbmrt.so new file mode 100644 index 0000000..d182777 Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmrt.so differ diff --git a/models/Baichuan2/src/lib_soc/libbmrt.so.1.0 b/models/Baichuan2/src/lib_soc/libbmrt.so.1.0 new file mode 100644 index 0000000..d182777 Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libbmrt.so.1.0 differ diff --git a/models/Baichuan2/src/lib_soc/libsentencepiece.a b/models/Baichuan2/src/lib_soc/libsentencepiece.a new file mode 100644 index 0000000..39debcd Binary files /dev/null and b/models/Baichuan2/src/lib_soc/libsentencepiece.a differ diff --git 
a/models/Baichuan2/src/tokenizer.model b/models/Baichuan2/src/tokenizer.model new file mode 100644 index 0000000..4348df0 Binary files /dev/null and b/models/Baichuan2/src/tokenizer.model differ diff --git a/models/Baichuan2/web_demo/CMakeLists.txt b/models/Baichuan2/web_demo/CMakeLists.txt new file mode 100755 index 0000000..119534c --- /dev/null +++ b/models/Baichuan2/web_demo/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 2.8) +project(baichuan2) + +if (NOT DEFINED TARGET_ARCH) + set(TARGET_ARCH pcie) +endif() + +set(CMAKE_INSTALL_PREFIX install) + +if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64") + add_definitions(-DSOC_TARGET) + link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc) + message("SoC mode, starting......") +elseif (${TARGET_ARCH} STREQUAL "pcie") + add_definitions(-DPCIE_TARGET) + link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie) + message("Pcie mode, starting......") +elseif (${TARGET_ARCH} STREQUAL "soc") + add_definitions(-DSOC_TARGET) + set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) + set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc) + set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) + link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc) + message("SoC mode, starting......") +endif() + + + + +include_directories(${PROJECT_SOURCE_DIR}/../src/include) + +add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror) +set(CMAKE_BUILD_TYPE "Debug") + +add_library(tpuchat SHARED chat.cpp) +target_link_libraries(tpuchat bmrt bmlib sentencepiece) diff --git a/models/Baichuan2/web_demo/chat.cpp b/models/Baichuan2/web_demo/chat.cpp new file mode 100755 index 0000000..84724c6 --- /dev/null +++ b/models/Baichuan2/web_demo/chat.cpp @@ -0,0 +1,419 @@ +//===----------------------------------------------------------------------===// +// +// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved. +// +// TPU-MLIR is licensed under the 2-Clause BSD License except for the +// third-party components. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include "memory.h" +#include "sentencepiece/sentencepiece_processor.h" +#include "bmruntime_interface.h" +#include + +static const int NUM_LAYERS = 32; +static const int MAX_LEN = 512; +static const float ATTENTION_MASK = -1000.; + +static const std::string TOKENIZER_MODEL = "tokenizer.model"; + +// #define EXPORT_RESULTS +#ifdef EXPORT_RESULTS +#include "cnpy.h" +static cnpy::npz_t map; + +template +static void add_array(std::string name, bm_handle_t bm_handle, + const bm_device_mem_t &dst) { + std::vector data(dst.size / sizeof(T)); + bm_memcpy_d2s(bm_handle, data.data(), dst); + cnpy::npz_add_array(map, name, data); +} + +static void save_array(std::string filename) { + cnpy::npz_save_all(filename, map); +} +#endif + +class Baichuan2 { +public: + void init(int devid, const std::string model, const std::string tokenizer_path); + void chat(); + void deinit(); + std::string name; + std::string history = ""; + int round = 0; + int token_length; + int EOS; + std::string predict_next_token(); + std::string predict_first_token(const std::string &input_str); + +private: + int forward_first(std::vector &tokens); + int forward_next(); + void load_sentencepiece(const std::string &tokenizer_path); + +private: + std::vector handles; + bm_handle_t bm_handle; + void *p_bmrt; + sentencepiece::SentencePieceProcessor sentencepiece; + const bm_net_info_t *net_blocks[NUM_LAYERS]; + const bm_net_info_t *net_blocks_cache[NUM_LAYERS]; + const bm_net_info_t *net_embed; + const bm_net_info_t *net_lm; + bm_tensor_t inputs_embed_512, outputs_embed_512; + bm_tensor_t inputs_lm, outputs_lm; + bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention; + bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS]; + bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS]; + bm_tensor_t present_key_cache, present_value_cache; + 
std::string name_embed; + std::string name_lm; + std::string name_blocks[NUM_LAYERS]; + std::string name_blocks_cache[NUM_LAYERS]; +}; + +void Baichuan2::load_sentencepiece(const std::string &model) { + printf("Load %s ... ", model.c_str()); + auto status = sentencepiece.Load(model); + if (!status.ok()) { + std::cout << status.ToString() << std::endl; + exit(-1); + } + EOS = sentencepiece.eos_id(); + printf("Done!\n"); +} + +void Baichuan2::init(int devid, const std::string model, const std::string tokenizer_path) { + load_sentencepiece(tokenizer_path); + // request bm_handle + bm_status_t status = bm_dev_request(&bm_handle, devid); + assert(BM_SUCCESS == status); + + // create bmruntime + p_bmrt = bmrt_create(bm_handle); + assert(NULL != p_bmrt); + + // load bmodel by file + printf("Model[%s] loading ....\n", model.c_str()); + bool ret = bmrt_load_bmodel(p_bmrt, model.c_str()); + assert(true == ret); + printf("Done!\n"); + // net names + name_embed = "embedding"; + name_lm = "lm_head"; + for (int i = 0; i < NUM_LAYERS; i++) { + name_blocks[i] = "block_" + std::to_string(i); + name_blocks_cache[i] = "block_cache_" + std::to_string(i); + } + + // net infos + net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str()); + net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str()); + for (int i = 0; i < NUM_LAYERS; i++) { + net_blocks[i] = bmrt_get_network_info(p_bmrt, name_blocks[i].c_str()); + net_blocks_cache[i] = + bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str()); + } + + // net device mem + ret = bmrt_tensor(&inputs_embed_512, p_bmrt, net_embed->input_dtypes[0], + net_embed->stages[1].input_shapes[0]); + assert(true == ret); + + ret = bmrt_tensor(&outputs_embed_512, p_bmrt, net_embed->output_dtypes[0], + net_embed->stages[1].output_shapes[0]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_pid, p_bmrt, net_blocks[0]->input_dtypes[1], + net_blocks[0]->stages[0].input_shapes[1]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_attention, p_bmrt, 
net_blocks[0]->input_dtypes[2], + net_blocks[0]->stages[0].input_shapes[2]); + assert(true == ret); + + ret = bmrt_tensor(&next_pid, p_bmrt, net_blocks_cache[0]->input_dtypes[1], + net_blocks_cache[0]->stages[0].input_shapes[1]); + assert(true == ret); + + ret = + bmrt_tensor(&next_attention, p_bmrt, net_blocks_cache[0]->input_dtypes[2], + net_blocks_cache[0]->stages[0].input_shapes[2]); + assert(true == ret); + + for (int i = 0; i < NUM_LAYERS; i++) { + ret = bmrt_tensor(&past_key[i], p_bmrt, net_blocks[0]->output_dtypes[1], + net_blocks[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&past_value[i], p_bmrt, net_blocks[0]->output_dtypes[2], + net_blocks[0]->stages[0].output_shapes[2]); + assert(true == ret); + ret = bmrt_tensor(&present_key[i], p_bmrt, net_blocks[0]->output_dtypes[1], + net_blocks[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&present_value[i], p_bmrt, net_blocks[0]->output_dtypes[2], + net_blocks[0]->stages[0].output_shapes[2]); + assert(true == ret); + } + ret = bmrt_tensor(&present_key_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[1], + net_blocks_cache[0]->stages[0].output_shapes[1]); + assert(true == ret); + ret = bmrt_tensor(&present_value_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[2], + net_blocks_cache[0]->stages[0].output_shapes[2]); + assert(true == ret); + + ret = bmrt_tensor(&inputs_lm, p_bmrt, net_lm->input_dtypes[0], + net_lm->stages[0].input_shapes[0]); + assert(true == ret); + ret = bmrt_tensor(&outputs_lm, p_bmrt, net_lm->output_dtypes[0], + net_lm->stages[0].output_shapes[0]); + assert(true == ret); +} + +void Baichuan2::deinit() { + bm_free_device(bm_handle, inputs_embed_512.device_mem); + bm_free_device(bm_handle, outputs_embed_512.device_mem); + bm_free_device(bm_handle, inputs_lm.device_mem); + bm_free_device(bm_handle, outputs_lm.device_mem); + bm_free_device(bm_handle, inputs_pid.device_mem); + bm_free_device(bm_handle, next_pid.device_mem); + 
bm_free_device(bm_handle, inputs_attention.device_mem); + bm_free_device(bm_handle, next_attention.device_mem); + bm_free_device(bm_handle, present_key_cache.device_mem); + bm_free_device(bm_handle, present_value_cache.device_mem); + for (int i = 0; i < NUM_LAYERS; i++) { + bm_free_device(bm_handle, past_key[i].device_mem); + bm_free_device(bm_handle, past_value[i].device_mem); + bm_free_device(bm_handle, present_key[i].device_mem); + bm_free_device(bm_handle, present_value[i].device_mem); + } + bmrt_destroy(p_bmrt); + for (auto h : handles) { + bm_dev_free(h); + } +} + + + +int Baichuan2::forward_first(std::vector &tokens) { + int input_ids[MAX_LEN] = {0}; // start token + int position_id[MAX_LEN] = {0}; + float attention_mask[MAX_LEN * MAX_LEN] = {0}; + token_length = tokens.size(); + + std::copy(tokens.begin(), tokens.end(), input_ids); + for (int i = 0; i < token_length; i++) { + position_id[i] = i; + } + + for (int i = 0; i < MAX_LEN; i++) { + for (int j = 0; j < MAX_LEN; j++) { + if (j <= i && i < token_length) { + } else { + attention_mask[i * MAX_LEN + j] = ATTENTION_MASK; + } + } + } + + // forward embeding + bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem, (void *)input_ids); + auto ret = + bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1, + &outputs_embed_512, 1, true, false); + assert(ret); + bm_thread_sync(bm_handle); + + // forward blocks + bm_memcpy_s2d(bm_handle, inputs_pid.device_mem, (void *)position_id); + bm_memcpy_s2d(bm_handle, inputs_attention.device_mem, (void *)attention_mask); + auto inputs_embed = outputs_embed_512; + inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0]; + bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention}; + for (int i = 0; i < NUM_LAYERS; i++) { + bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]}; + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3, + outputs_block, 3, true, false); + assert(ret); + 
bm_thread_sync(bm_handle); + } + int bytes = inputs_embed.device_mem.size / MAX_LEN; + bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0, + inputs_embed.device_mem, (token_length - 1) * bytes, + bytes); + ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1, + &outputs_lm, 1, true, false); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem); + return token; +} + +int Baichuan2::forward_next() { + float attention_mask[MAX_LEN + 1] = {0}; + for (int i = token_length - 1; i < MAX_LEN; i++) { + attention_mask[i] = ATTENTION_MASK; + } + int32_t position_id = token_length - 1; + // embedding + outputs_lm.shape = net_embed->stages[0].input_shapes[0]; + auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &outputs_lm, 1, + &inputs_lm, 1, true, false); + assert(ret); + bm_thread_sync(bm_handle); + + // blocks + bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask); + bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id); + auto inputs_embed = inputs_lm; + inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0]; + int bytes = bm_mem_get_device_size(present_key_cache.device_mem); + int token_offset = (token_length - 1) * bytes; + for (int i = 0; i < NUM_LAYERS; i++) { + bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention, + past_key[i], past_value[i]}; + bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache}; + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(), + inputs_block, 5, outputs_block, 3, true, false); + assert(ret); + bm_thread_sync(bm_handle); + bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset, + present_key_cache.device_mem, 0, + bytes); + bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset, + present_value_cache.device_mem, 0, + bytes); + } + outputs_lm.shape = net_lm->stages[0].output_shapes[0]; + ret = 
bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1, + &outputs_lm, 1, true, false); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem); + return token; +} + + +std::string Baichuan2::predict_first_token(const std::string &input_str) { + history = input_str; + //int tok_num = 1; + std::vector tokens; + sentencepiece.Encode(history, &tokens); + tokens.insert(tokens.begin(), 1); + if (tokens.empty()) { + round = 0; + history = "Sorry: your question is too wierd!!\n"; + return history; + } + // make sure token not too large + if (tokens.size() > MAX_LEN - 10) { + // reset + if (round == 0) { + history = "Error: your question is too large!\n"; + return history; + } + round = 0; + history = ""; + return predict_first_token(input_str); + } + int token = forward_first(tokens); + int pre_token = 0; + std::string pre_word; + std::string word; + std::vector pre_ids = {pre_token}; + std::vector ids = {pre_token,token}; + sentencepiece.Decode(pre_ids, &pre_word); + sentencepiece.Decode(ids, &word); + std::string diff = word.substr(pre_word.size()); +#ifdef PRINT + printf("token %d",token); + printf("diff %s",diff.c_str()); +#endif + history += diff; + if (token_length < MAX_LEN) { + token_length++; + } + return diff; +} + +std::string Baichuan2::predict_next_token() { + int pre_token; + pre_token = 0; + int token = forward_next(); + if(token == EOS){ + round = 0; + history = history.substr(history.size()/2); + return "_GETEOS_"; + } + std::string pre_word; + std::string word; + std::vector pre_ids = {pre_token}; + std::vector ids = {pre_token, token}; + sentencepiece.Decode(pre_ids, &pre_word); + sentencepiece.Decode(ids, &word); + std::string diff = word.substr(pre_word.size()); +#ifdef PRINT + printf("token %d",token); + printf("diff %s",diff.c_str()); +#endif + history += diff; + if (token_length < MAX_LEN) { + token_length++; + }else{ + round = 0; + return "_GETMAX_"; + } + return diff; +} + + +extern 
"C" { + + +Baichuan2 *Baichuan2_with_devid_and_model(int devid, const char *bmodel_path, const char *tokenizer_path) { + Baichuan2 *chat = new Baichuan2(); + chat->init(devid, bmodel_path, tokenizer_path); + return chat; +} + +void Baichuan2_delete(Baichuan2 *chat) { delete chat; } + +void Baichuan2_deinit(Baichuan2 *chat) { + chat->deinit(); +} + +const char *get_history(Baichuan2 *chat) { + std::string str = chat->history; + return strdup(str.c_str()); +} + +const char *set_history(Baichuan2 *chat, const char *history) { + chat->history = history; + return strdup(history); +} + +const char *Baichuan2_predict_first_token(Baichuan2 *chat, const char *input_str) { + std::string str = chat->predict_first_token(input_str); + return strdup(str.c_str()); +} + +const char *Baichuan2_predict_next_token(Baichuan2 *chat) { + std::string str = chat->predict_next_token(); + return strdup(str.c_str()); +} + +const int get_eos(Baichuan2 *chat){ + const int res = chat->EOS; + return res; +} +} diff --git a/models/Baichuan2/web_demo/chat.py b/models/Baichuan2/web_demo/chat.py new file mode 100755 index 0000000..804bce8 --- /dev/null +++ b/models/Baichuan2/web_demo/chat.py @@ -0,0 +1,97 @@ +# coding=utf-8 + +import ctypes + + +class TokenWord(ctypes.Structure): + _fields_ = [ + ("token", ctypes.c_int), + ("word", ctypes.c_char * 2048) # 假设最大长度为 100,你可以根据实际情况调整 + ] + + +class TPUChatglm: + def __init__(self): + self.lib = ctypes.cdll.LoadLibrary('./build/libtpuchat.so') + device_id = 3 + bmodel_path = "../model/baichuan2-7b-test_int8.bmodel" + token_path = "../model/tokenizer.model" + self.device_id = device_id + self.bmodel_path = bmodel_path + self.token_path = token_path + self.libset() + self.init() + + def libset(self): + self.lib.Baichuan2_with_devid_and_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p] + self.lib.Baichuan2_with_devid_and_model.restype = ctypes.c_void_p + + self.lib.Baichuan2_delete.argtypes = [ctypes.c_void_p] + + # deinit + 
self.lib.Baichuan2_deinit.argtypes = [ctypes.c_void_p] + + # Baichuan2_predict_first_token + self.lib.Baichuan2_predict_first_token.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + self.lib.Baichuan2_predict_first_token.restype = ctypes.c_char_p + + # Baichuan2_predict_next_token + self.lib.Baichuan2_predict_next_token.argtypes = [ctypes.c_void_p] + self.lib.Baichuan2_predict_next_token.restype = ctypes.c_char_p + + # get_eos + self.lib.get_eos.argtypes = [ctypes.c_void_p] + self.lib.get_eos.restype = ctypes.c_int + # get_history + self.lib.get_history.argtypes = [ctypes.c_void_p] + self.lib.get_history.restype = ctypes.c_char_p + # set history + self.lib.set_history.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + + def init(self): + self.obj = self.lib.Baichuan2_with_devid_and_model(self.device_id, self.bmodel_path.encode('utf-8'), + self.token_path.encode('utf-8')) + + def predict_first_token(self, context): + return self.lib.Baichuan2_predict_first_token(self.obj, context.encode('utf-8')).decode('utf-8') + + def predict_next_token(self): + return self.lib.Baichuan2_predict_next_token(self.obj).decode('utf-8') + + def predict(self, context): + + first_token = self.predict_first_token(context) + # print(first_token, end='') + res = '' + while True: + next_token = self.predict_next_token() + if next_token == '_GETMAX_' or next_token == '_GETEOS_': + # print(next_token) + break + # print(next_token, end='') + res += next_token + return res + + def stream_predict(self, query, history): + history.append((query, '')) + + prompt = '' + # for i, (old_query, response) in enumerate(history): + # prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) + # prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) + prompt = "" + query + "" + + res = '' + first_token = self.predict_first_token(prompt) + res += first_token + + while True: + next_token = self.predict_next_token() + if next_token == '_GETMAX_' or next_token == '_GETEOS_': + break 
+ res += next_token + history[-1] = (query, res) + yield res, history + + def get_config(self): + pass \ No newline at end of file diff --git a/models/Baichuan2/web_demo/web_demo.py b/models/Baichuan2/web_demo/web_demo.py new file mode 100755 index 0000000..1dc5ee2 --- /dev/null +++ b/models/Baichuan2/web_demo/web_demo.py @@ -0,0 +1,108 @@ +import time +import gradio as gr +import mdtex2html +from chat import TPUChatglm + + +def postprocess(self, y): + if y is None: + return [] + for i, (message, response) in enumerate(y): + y[i] = ( + None if message is None else mdtex2html.convert((message)), + None if response is None else mdtex2html.convert(response), + ) + return y + + +gr.Chatbot.postprocess = postprocess + +glm = TPUChatglm() + +def parse_text(text): + """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/""" + lines = text.split("\n") + lines = [line for line in lines if line != ""] + count = 0 + for i, line in enumerate(lines): + if "```" in line: + count += 1 + items = line.split('`') + if count % 2 == 1: + lines[i] = f'
'
+            else:
+                lines[i] = f'
' + else: + if i > 0: + if count % 2 == 1: + line = line.replace("`", "\`") + line = line.replace("<", "<") + line = line.replace(">", ">") + line = line.replace(" ", " ") + line = line.replace("*", "*") + line = line.replace("_", "_") + line = line.replace("-", "-") + line = line.replace(".", ".") + line = line.replace("!", "!") + line = line.replace("(", "(") + line = line.replace(")", ")") + line = line.replace("$", "$") + lines[i] = "
" + line + text = "".join(lines) + return text + + +def gen(input, history): + i = 0 + history.append((input, '')) + res = '' + while i < 10: + i += 1 + res += str(i) + time.sleep(0.05) + history[-1] = (input, res) + yield res, history + + +def predict(input, chatbot, max_length, top_p, temperature, history): + + chatbot.append((parse_text(input), "")) + for response, history in glm.stream_predict(input, history): + chatbot[-1] = (parse_text(input), parse_text(response)) + yield chatbot, history + + +def reset_user_input(): + return gr.update(value='') + + +def reset_state(): + return [], [], None + + +with gr.Blocks() as demo: + gr.HTML("""

Baichuan2-7B TPU

""") + + chatbot = gr.Chatbot() + with gr.Row(): + with gr.Column(scale=4): + with gr.Column(scale=12): + user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style( + container=False) + with gr.Column(min_width=32, scale=1): + submitBtn = gr.Button("Submit", variant="primary") + with gr.Column(scale=1): + emptyBtn = gr.Button("Clear History") + max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True) + top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True) + temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True) + + history = gr.State([]) + + submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], + [chatbot, history], show_progress=True) + submitBtn.click(reset_user_input, [], [user_input]) + + emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True) + +demo.queue().launch(share=True, server_name="0.0.0.0", inbrowser=True) \ No newline at end of file diff --git a/models/ChatGLM3/compile/compile.sh b/models/ChatGLM3/compile/compile.sh index d1fe154..6fd6028 100755 --- a/models/ChatGLM3/compile/compile.sh +++ b/models/ChatGLM3/compile/compile.sh @@ -96,6 +96,7 @@ model_deploy.py \ --quant_input \ --quant_output \ --chip bm1684x \ + $device_args \ --model embedding_cache.bmodel rm *.npz @@ -112,7 +113,7 @@ pushd $outdir model_transform.py \ --model_name lm_head \ - --model_def ../../lm_head.onnx \ + --model_def ../../onnx/lm_head.onnx \ --mlir lm_head.mlir model_deploy.py \ @@ -141,24 +142,28 @@ for ((i=0; i<=$num_layers; i++)); do model_transform.py \ --model_name block_$i \ - --model_def ../../block_$i.onnx \ + --model_def ../../onnx/block_$i.onnx \ --mlir block_$i.mlir model_deploy.py \ --mlir block_$i.mlir \ $quantize_args \ + --quant_input \ + --quant_output \ --chip bm1684x \ $device_args \ --model block_$i.bmodel model_transform.py \ --model_name block_cache_$i \ - --model_def 
../../block_cache_$i.onnx \ + --model_def ../../onnx/block_cache_$i.onnx \ --mlir block_cache_$i.mlir model_deploy.py \ --mlir block_cache_$i.mlir \ $quantize_args \ + --quant_input \ + --quant_output \ --chip bm1684x \ $device_args \ --model block_cache_$i.bmodel diff --git a/models/ChatGLM3/compile/export_onnx.py b/models/ChatGLM3/compile/export_onnx.py index 56cb542..a23d33b 100755 --- a/models/ChatGLM3/compile/export_onnx.py +++ b/models/ChatGLM3/compile/export_onnx.py @@ -141,7 +141,8 @@ def convert_block_cache(layer_id): def convert_embedding(): model = Embedding() - torch.onnx.export(model, (torch.tensor([0, 1, 2, 3])), + input = torch.tensor([range(SEQ_LENGTH)]) + torch.onnx.export(model, (input), f'{folder}/embedding.onnx', verbose=False, input_names=['input_ids'], diff --git a/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py b/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py index 163d634..a970776 100755 --- a/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py +++ b/models/ChatGLM3/compile/files/chatglm3-6b/modeling_chatglm.py @@ -278,7 +278,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): attention_mask.tril_() attention_mask = ~attention_mask if attention_mask is not None: - attention_scores = attention_scores + (attention_mask * -10000.0) + attention_scores = attention_scores + attention_mask #attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) attention_probs = F.softmax(attention_scores, dim=-1) attention_probs = attention_probs.type_as(value_layer) diff --git a/models/ChatGLM3/demo/CMakeLists.txt b/models/ChatGLM3/demo/CMakeLists.txt index a9c250b..e135a49 100755 --- a/models/ChatGLM3/demo/CMakeLists.txt +++ b/models/ChatGLM3/demo/CMakeLists.txt @@ -1,26 +1,28 @@ cmake_minimum_required(VERSION 2.8) project(chatglm) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "") + if (NOT DEFINED TARGET_ARCH) set(TARGET_ARCH pcie) endif() 
-include_directories(${PROJECT_SOURCE_DIR}/../src/include) +include_directories(${PROJECT_SOURCE_DIR}/../support/include) if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64") add_definitions(-DSOC_TARGET) - link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc) + link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc) message("SoC mode, starting......") elseif (${TARGET_ARCH} STREQUAL "pcie") add_definitions(-DPCIE_TARGET) - link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie) + link_directories(${PROJECT_SOURCE_DIR}/../support/lib_pcie) message("PCIE mode, starting......") elseif (${TARGET_ARCH} STREQUAL "soc") add_definitions(-DSOC_TARGET) set(CMAKE_C_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc) set(CMAKE_ASM_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc) set(CMAKE_CXX_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-g++) - link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc) + link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc) message("SoC mode, starting......") endif() @@ -28,4 +30,9 @@ add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror) set(CMAKE_BUILD_TYPE "Debug") add_executable(chatglm demo.cpp) -target_link_libraries(chatglm bmlib bmrt sentencepiece) +target_link_libraries(chatglm bmrt bmlib sentencepiece) + +if (${TARGET_ARCH} STREQUAL "pcie") + add_executable(chatglm_parallel demo_parallel.cpp) + target_link_libraries(chatglm_parallel bmrt bmlib sentencepiece) +endif() \ No newline at end of file diff --git a/models/ChatGLM3/demo/demo_parallel.cpp b/models/ChatGLM3/demo/demo_parallel.cpp new file mode 100755 index 0000000..671a0f7 --- /dev/null +++ b/models/ChatGLM3/demo/demo_parallel.cpp @@ -0,0 +1,615 @@ +//===----------------------------------------------------------------------===// +// +// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved. +// +// TPU-MLIR is licensed under the 2-Clause BSD License except for the +// third-party components. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include "memory.h" +#include "sentencepiece/sentencepiece_processor.h" +#include "bmruntime_interface.h" +#include +#include +#include + +static const uint16_t ATTENTION_MASK = 0xF0E2; + +class ChatGLM { +public: + void init(const std::vector &devid, std::string model_path, std::string tokenizer_path); + void chat(); + void deinit(); + +private: + void answer(const std::string &input_str); + void tokenizer_encode(const std::string &input_str, std::vector &tokens); + int forward_first(std::vector &tokens); + int forward_next(int cur_token); + void move2end(const bm_tensor_t &kv); + void load_sentencepiece(std::string tokenizer_path); + void build_system_prompt(); + +private: + std::vector handles; + bm_handle_t bm_handle; + void *p_bmrt; + sentencepiece::SentencePieceProcessor sentencepiece; + const bm_net_info_t *net_embed; + const bm_net_info_t *net_embed_cache; + const bm_net_info_t *net_lm; + std::vector net_blocks; + std::vector net_blocks_cache; + std::vector inputs_embed_512, outputs_embed_512; + std::vector inputs_pid, next_pid, inputs_attention, next_attention; + std::vector> past_key, past_value; + std::vector inputs_lm, outputs_lm; + std::string name_embed; + std::string name_embed_cache; + std::string name_lm; + std::vector name_blocks; + std::vector name_blocks_cache; + std::string system_string = + "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow " + "the user's instructions carefully. Respond using markdown."; + std::vector history_tokens; + std::vector head_prompt{64790, 64792, 64794, 30910, + 13}; // head + system id + \n + std::vector system_prompt; + + int device_num; + int round = 0; + int token_length; + int EOS; + int SEQLEN; + int NUM_LAYERS; +}; + +void ChatGLM::load_sentencepiece(std::string tokenizer_path) { + printf("Load %s ... 
", tokenizer_path.c_str()); + auto status = sentencepiece.Load(tokenizer_path); + if (!status.ok()) { + std::cout << status.ToString() << std::endl; + exit(-1); + } + EOS = sentencepiece.eos_id(); + printf("Done!\n"); +} + +void ChatGLM::init(const std::vector &devices, std::string model_path, std::string tokenizer_path) { + device_num = devices.size(); + load_sentencepiece(tokenizer_path); + // request bm_handle + std::cout << "Device [ "; + for (auto d : devices) { + std::cout << d << " "; + } + std::cout << "] loading ....\n"; + for (auto d : devices) { + bm_handle_t h; + bm_status_t status = bm_dev_request(&h, d); + assert(BM_SUCCESS == status); + handles.push_back(h); + } + bm_handle = handles[0]; + + // decode system prompt + sentencepiece.Encode(system_string, &system_prompt); + + // create bmruntime +#ifdef SOC_TARGET + p_bmrt = bmrt_create(handles[0]); +#else + p_bmrt = bmrt_create_ex(handles.data(), handles.size()); +#endif + assert(NULL != p_bmrt); + + // load bmodel by file + printf("Model[%s] loading ....\n", model_path.c_str()); + bool ret = bmrt_load_bmodel(p_bmrt, model_path.c_str()); + assert(true == ret); + printf("Done!\n"); + + // set NUM_LAYERS + auto num_nets = bmrt_get_network_number(p_bmrt); + NUM_LAYERS = (num_nets - 2) / 2; + + // net names + name_embed = "embedding"; + name_embed_cache = "embedding_cache"; + name_lm = "lm_head"; + for (int i = 0; i < NUM_LAYERS; i++) { + name_blocks.emplace_back("block_" + std::to_string(i)); + name_blocks_cache.emplace_back("block_cache_" + std::to_string(i)); + } + + // net infos + net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str()); + net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str()); + net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str()); + for (int i = 0; i < NUM_LAYERS; i++) { + net_blocks.emplace_back( + bmrt_get_network_info(p_bmrt, name_blocks[i].c_str())); + net_blocks_cache.emplace_back( + bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str())); + } + + 
// set SEQLEN + SEQLEN = net_embed->stages[0].input_shapes[0].dims[1]; + + // resize + net_blocks.resize(NUM_LAYERS); + net_blocks_cache.resize(NUM_LAYERS); + past_key.resize(NUM_LAYERS); + past_value.resize(NUM_LAYERS); + + // net device mem + inputs_embed_512.resize(net_embed->input_num); + for (int i = 0; i < device_num; ++i) { + ret = bmrt_tensor_ex(&inputs_embed_512[i], p_bmrt, + net_embed->input_loc_devices[i], + net_embed->input_dtypes[i], + net_embed->stages[0].input_shapes[i]); + assert(true == ret); + } + + outputs_embed_512.resize(net_embed->output_num); + for (int i = 0; i < device_num; ++i) { + ret = bmrt_tensor_ex(&outputs_embed_512[i], p_bmrt, + net_embed->output_loc_devices[i], + net_embed->output_dtypes[i], + net_embed->stages[0].output_shapes[i]); + assert(true == ret); + } + + inputs_pid.resize(device_num); + inputs_attention.resize(device_num); + int in_num = net_blocks[0]->input_num / device_num; + for (int i = 0; i < device_num; ++i) { + ret = bmrt_tensor_ex(&inputs_pid[i], p_bmrt, + net_blocks[0]->input_loc_devices[1 + i * in_num], + net_blocks[0]->input_dtypes[1 + i * in_num], + net_blocks[0]->stages[0].input_shapes[1 + i * in_num]); + assert(true == ret); + + ret = bmrt_tensor_ex(&inputs_attention[i], p_bmrt, + net_blocks[0]->input_loc_devices[2 + i * in_num], + net_blocks[0]->input_dtypes[2 + i * in_num], + net_blocks[0]->stages[0].input_shapes[2 + i * in_num]); + assert(true == ret); + } + + + next_pid.resize(device_num); + next_attention.resize(device_num); + int in_num_cache = net_blocks_cache[0]->input_num / device_num; + for (int i = 0; i < device_num; ++i) { + ret = bmrt_tensor_ex(&next_pid[i], p_bmrt, + net_blocks_cache[0]->input_loc_devices[1 + i * in_num_cache], + net_blocks_cache[0]->input_dtypes[1 + i * in_num_cache], + net_blocks_cache[0]->stages[0].input_shapes[1 + i * in_num_cache]); + assert(true == ret); + + ret = bmrt_tensor_ex(&next_attention[i], p_bmrt, + net_blocks_cache[0]->input_loc_devices[2 + i * in_num_cache], + 
net_blocks_cache[0]->input_dtypes[2 + i * in_num_cache], + net_blocks_cache[0]->stages[0].input_shapes[2 + i * in_num_cache]); + assert(true == ret); + } + + int out_num = net_blocks[0]->output_num / device_num; + for (int i = 0; i < NUM_LAYERS; i++) { + past_key[i].resize(device_num); + past_value[i].resize(device_num); + for (int j = 0; j < device_num; j++) { + ret = bmrt_tensor_ex(&past_key[i][j], p_bmrt, + net_blocks[0]->output_loc_devices[1 + j * out_num], + net_blocks[0]->output_dtypes[1 + j * out_num], + net_blocks[0]->stages[0].output_shapes[1 + j * out_num]); + assert(true == ret); + ret = bmrt_tensor_ex(&past_value[i][j], p_bmrt, + net_blocks[0]->output_loc_devices[2 + j * out_num], + net_blocks[0]->output_dtypes[2 + j * out_num], + net_blocks[0]->stages[0].output_shapes[2 + j * out_num]); + assert(true == ret); + } + } + + inputs_lm.resize(device_num); + outputs_lm.resize(device_num); + for (int i = 0; i < device_num; ++i) { + ret = bmrt_tensor_ex(&inputs_lm[i], p_bmrt, i, net_lm->input_dtypes[0], + net_lm->stages[0].input_shapes[0]); + assert(true == ret); + ret = bmrt_tensor_ex(&outputs_lm[i], p_bmrt, i, net_lm->output_dtypes[0], + net_lm->stages[0].output_shapes[0]); + assert(true == ret); + } +} + +void ChatGLM::deinit() { + for (int i = 0; i < device_num; ++i) { + bm_free_device(handles[i], inputs_embed_512[i].device_mem); + bm_free_device(handles[i], outputs_embed_512[i].device_mem); + bm_free_device(handles[i], inputs_pid[i].device_mem); + bm_free_device(handles[i], next_pid[i].device_mem); + bm_free_device(handles[i], inputs_attention[i].device_mem); + bm_free_device(handles[i], next_attention[i].device_mem); + bm_free_device(handles[i], inputs_lm[i].device_mem); + bm_free_device(handles[i], outputs_lm[i].device_mem); + } + for (int i = 0; i < NUM_LAYERS; i++) { + for (int j = 0; j < device_num; j++) { + bm_free_device(handles[j], past_key[i][j].device_mem); + bm_free_device(handles[j], past_value[i][j].device_mem); + } + } + 
bmrt_destroy(p_bmrt); + for (auto h : handles) { + bm_dev_free(h); + } +} + +// after first block, move real result to end of mem +void ChatGLM::move2end(const bm_tensor_t &kv) { + if (token_length >= SEQLEN) { + return; + } + auto total_size = bm_mem_get_device_size(kv.device_mem); + auto bytes = total_size / SEQLEN; + auto real_size = token_length * bytes; + auto mem = + bm_mem_from_device(bm_mem_get_device_addr(kv.device_mem), real_size); + auto buffer = new uint8_t[real_size]; + auto dst = new uint8_t[total_size]; + bm_memcpy_d2s(bm_handle, (void *)buffer, mem); + memset(dst, 0, total_size - real_size); + memcpy(dst + total_size - real_size, buffer, real_size); + bm_memcpy_s2d(bm_handle, kv.device_mem, (void *)dst); + delete[] buffer; + delete[] dst; +} + +int ChatGLM::forward_first(std::vector &tokens) { + std::vector input_ids(SEQLEN, 0); + std::vector position_id(SEQLEN, 0); + std::vector attention_mask(SEQLEN * SEQLEN, 0); + + input_ids[0] = 64790; + input_ids[1] = 64792; + std::copy(tokens.begin(), tokens.end(), input_ids.data() + 2); + + token_length = tokens.size() + 2; + for (int i = 0; i < token_length; i++) { + position_id[i] = i; + } + for (int i = 0; i < SEQLEN; i++) { + for (int j = 0; j < SEQLEN; j++) { + if (j <= i && i < token_length) { + } else { + attention_mask[i * SEQLEN + j] = ATTENTION_MASK; + } + } + } + + // forward embeding + std::vector input_nums(device_num, 1); + std::vector datas(device_num, (void*)input_ids.data()); + bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed_512.data(), datas.data(), + input_nums.data(), device_num); + auto ret = + bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), + inputs_embed_512.data(), inputs_embed_512.size(), + outputs_embed_512.data(), outputs_embed_512.size(), + true, false); + assert(ret); + bm_thread_sync(bm_handle); + + // forward blocks + std::vector pos_id_datas(device_num, position_id.data()); + std::vector in_attn_datas(device_num, attention_mask.data()); + bmrt_memcpy_s2d_parallel(p_bmrt, 
inputs_pid.data(), pos_id_datas.data(), + input_nums.data(), device_num); + bmrt_memcpy_s2d_parallel(p_bmrt, inputs_attention.data(),in_attn_datas.data(), + input_nums.data(), device_num); + auto embed_512 = outputs_embed_512; + std::vector inputs_block; + std::vector outputs_block; + for (int i = 0; i < device_num; ++i) { + embed_512[i].shape = net_blocks[0]->stages[0].input_shapes[0]; + inputs_block.push_back(embed_512[i]); + inputs_block.push_back(inputs_pid[i]); + inputs_block.push_back(inputs_attention[i]); + outputs_block.push_back(embed_512[i]); + outputs_block.push_back(past_key[0][i]); + outputs_block.push_back(past_value[0][i]); + } + + for (int i = 0; i < NUM_LAYERS; i++) { + for (int j = 0; j < device_num; ++j) { + outputs_block[1 + j * 3] = past_key[i][j]; + outputs_block[2 + j * 3] = past_value[i][j]; + } + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), + inputs_block.data(), inputs_block.size(), + outputs_block.data(), outputs_block.size(), + true, false); + assert(ret); + bm_thread_sync(bm_handle); + for (int j = 0; j < device_num; ++j) { + move2end(past_key[i][j]); + move2end(past_value[i][j]); + } + } + + // forward lmhead + int bytes = embed_512[0].device_mem.size / SEQLEN; + bm_memcpy_d2d_byte(bm_handle, inputs_lm[0].device_mem, 0, + embed_512[0].device_mem, (token_length - 1) * bytes, + bytes); + ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1, + &outputs_lm[0], 1, true, false); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem); + return token; +} + +int ChatGLM::forward_next(int cur_token) { + std::vector attention_mask(SEQLEN + 1, 0); + for (int i = 0; i <= SEQLEN - token_length; i++) { + attention_mask[i] = ATTENTION_MASK; + } + int32_t position_id = token_length - 1; + + // forward embedding + std::vector inputs_embed; + std::vector input_datas; + std::vector input_nums(device_num, 1); + for (int i = 0; i < device_num; ++i) { + 
inputs_embed.push_back(outputs_lm[i]); // token_id + inputs_embed[i].shape = net_embed_cache->stages[0].input_shapes[0]; + input_datas.push_back((void*)(&cur_token)); + } + bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed.data(), input_datas.data(), + input_nums.data(), device_num); + auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(), + inputs_embed.data(), inputs_embed.size(), + inputs_lm.data(), inputs_lm.size(), true, false); + assert(ret); + bm_thread_sync(bm_handle); + + // forward blocks + std::vector attn_datas(device_num, attention_mask.data()); + std::vector pid_datas(device_num, &position_id); + bmrt_memcpy_s2d_parallel(p_bmrt, next_attention.data(), attn_datas.data(), + input_nums.data(), device_num); + bmrt_memcpy_s2d_parallel(p_bmrt, next_pid.data(), pid_datas.data(), + input_nums.data(), device_num); + + // WARNING: make inputs_lm device_num + std::vector embed_1 = inputs_lm; + for (int i = 0; i < device_num; ++i) { + embed_1[i].shape = net_blocks_cache[0]->stages[0].input_shapes[0]; + } + std::vector inputs_block; + std::vector outputs_block; + for (int i = 0; i < device_num; ++i) { + inputs_block.push_back(embed_1[i]); + inputs_block.push_back(next_pid[i]); + inputs_block.push_back(next_attention[i]); + inputs_block.push_back(past_key[0][i]); + inputs_block.push_back(past_value[0][i]); + outputs_block.push_back(embed_1[i]); + outputs_block.push_back(past_key[0][i]); + outputs_block.push_back(past_value[0][i]); + } + + for (int i = 0; i < NUM_LAYERS; i++) { + for (int j = 0; j < device_num; ++j) { + inputs_block[3 + j * 5] = past_key[i][j]; + inputs_block[4 + j * 5] = past_value[i][j]; + outputs_block[1 + j * 3] = past_key[i][j]; + outputs_block[2 + j * 3] = past_value[i][j]; + } + ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(), + inputs_block.data(), inputs_block.size(), + outputs_block.data(), outputs_block.size(), + true, false); + assert(ret); + bm_thread_sync(bm_handle); + } + + // forward lmhead + ret = 
bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1, + &outputs_lm[0], 1, true, false); + assert(ret); + bm_thread_sync(bm_handle); + + int token = 0; + bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem); + return token; +} + +void ChatGLM::build_system_prompt() { + history_tokens.clear(); + history_tokens.insert(history_tokens.end(), head_prompt.begin(), + head_prompt.end()); + history_tokens.insert(history_tokens.end(), system_prompt.begin(), + system_prompt.end()); +} + +void ChatGLM::chat() { + while (true) { + std::cout << "\nQuestion: "; + std::string input_str; + std::getline(std::cin, input_str); + if (input_str == "exit") { + break; + } + std::cout << "\nAnswer: " << std::flush; + answer(input_str); + std::cout << std::endl; + } +} + +void ChatGLM::answer(const std::string &input_str) { + // auto time_0 = std::chrono::system_clock::now(); + int tok_num = 0; + std::vector tokens; + std::vector prompt{64795, 30910, 13}; + sentencepiece.Encode(input_str, &tokens); + tokens.insert(tokens.begin(), prompt.begin(), prompt.end()); + tokens.push_back(64796); + if (history_tokens.size() == 0) { + build_system_prompt(); + } + history_tokens.insert(history_tokens.end(), tokens.begin(), tokens.end()); + + if (history_tokens.empty()) { + printf("Sorry: your question is too wierd!!\n"); + round = 0; + history_tokens.clear(); + return; + } + // make sure token not too large + if ((int)history_tokens.size() > SEQLEN - 10) { + // reset + history_tokens.clear(); + if (round == 0) { + printf("Error: your question is too large!\n"); + return; + } + round = 0; + answer(input_str); + return; + } + int pre_token = 0; + auto t0 = std::chrono::system_clock::now(); + int token = forward_first(history_tokens); + auto t1 = std::chrono::system_clock::now(); + while (token != EOS && token_length < SEQLEN) { + std::string pre_word; + std::string word; + std::vector pre_ids = {pre_token}; + std::vector ids = {pre_token, token}; + sentencepiece.Decode(pre_ids, 
&pre_word); + sentencepiece.Decode(ids, &word); + std::string diff = word.substr(pre_word.size()); + history_tokens.emplace_back(token); + std::cout << diff << std::flush; + if (token_length < SEQLEN) { + token_length++; + } + tok_num++; + token = forward_next(token); + } + auto t2 = std::chrono::system_clock::now(); + auto use0 = std::chrono::duration_cast(t1 - t0); + auto use1 = std::chrono::duration_cast(t2 - t1); + printf("\n\nfirst token latency: %f s", (use0.count() * 1e-6)); + printf("\nspeed: %f token/s\n", tok_num / (use1.count() * 1e-6)); + if (token_length >= SEQLEN) { + history_tokens.clear(); + round = 0; + } else { + round++; + } +} + +static void split(const std::string &s, const std::string &delim, + std::vector &ret) { + size_t last = 0; + size_t index = s.find_first_of(delim, last); + while (index != std::string::npos) { + ret.push_back(s.substr(last, index - last)); + last = index + 1; + index = s.find_first_of(delim, last); + } + if (last < s.length()) { + ret.push_back(s.substr(last)); + } +} + +static std::vector parseCascadeDevices(const std::string &str) { + std::vector devices; + std::vector sub_str; + split(str, ",", sub_str); + for (auto &s : sub_str) { + devices.push_back(std::atoi(s.c_str())); + } + return devices; +} + +void Usage() { + printf("Usage:\n" + " --help : Show help info.\n" + " --model : Set model path \n" + " --tokenizer : Set tokenizer path \n" + " --devid : Set devices to run for model, e.g. 1,2. 
if not " + "set, use 0\n"); +} + +void processArguments(int argc, char *argv[], std::string &model_path, std::string &tokenizer_path, + std::vector &devices) { + struct option longOptions[] = {{"model", required_argument, nullptr, 'm'}, + {"tokenizer", required_argument, nullptr, 't'}, + {"devid", required_argument, nullptr, 'd'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0}}; + + int optionIndex = 0; + int option; + + while ((option = getopt_long(argc, argv, "m:t:d:h:", longOptions, + &optionIndex)) != -1) { + switch (option) { + case 'm': + model_path = optarg; + break; + case 't': + tokenizer_path = optarg; + break; + case 'd': + devices = parseCascadeDevices(optarg); + break; + case 'h': + Usage(); + exit(EXIT_FAILURE); + case '?': + Usage(); + exit(EXIT_FAILURE); + default: + exit(EXIT_FAILURE); + } + } +} + +int main(int argc, char **argv) { + // set your bmodel path here + printf("Demo for ChatGLM in BM1684X, support ChatGLM1/2/3\n"); + std::string model_path; + std::string tokenizer_path; + std::vector devices = {0}; + processArguments(argc, argv, model_path, tokenizer_path, devices); + if (model_path.empty()) { + Usage(); + exit(EXIT_FAILURE); + } + + ChatGLM glm; + printf("Init Environment ...\n"); + glm.init(devices, model_path, tokenizer_path); + printf("==========================\n"); + glm.chat(); + glm.deinit(); + return 0; +} diff --git a/models/ChatGLM3/support/include/bmdef.h b/models/ChatGLM3/support/include/bmdef.h index 68f251e..d41a4b0 100755 --- a/models/ChatGLM3/support/include/bmdef.h +++ b/models/ChatGLM3/support/include/bmdef.h @@ -98,7 +98,9 @@ typedef struct bm_net_info_s { size_t* max_input_bytes; /* max_input_bytes[0]/ [1] / ... / [input_num-1] */ size_t* max_output_bytes; /* max_output_bytes[0] / [1] / ... 
/ [output_num-1] */ int* input_zero_point; /* input_zero_point[0] / [1] / .../ [input_num-1] */ - int* output_zero_point; /* output_zero_point[0] / [1] / .../ [input_num-1] */ + int* output_zero_point; /* output_zero_point[0] / [1] / .../ [output_num-1] */ + int *input_loc_devices; /* input_loc_device[0] / [1] / .../ [input_num-1] */ + int *output_loc_devices; /* output_loc_device[0] / [1] / .../ [output_num-1] */ } bm_net_info_t; typedef struct api_info_s { diff --git a/models/ChatGLM3/support/include/bmlib_runtime.h b/models/ChatGLM3/support/include/bmlib_runtime.h index 60094e1..071cfe0 100755 --- a/models/ChatGLM3/support/include/bmlib_runtime.h +++ b/models/ChatGLM3/support/include/bmlib_runtime.h @@ -1628,6 +1628,7 @@ typedef struct bm_profile { unsigned long cdma_out_time; unsigned long cdma_out_counter; unsigned long tpu_process_time; + unsigned long tpu1_process_time; unsigned long sent_api_counter; unsigned long completed_api_counter; #else @@ -1636,6 +1637,7 @@ typedef struct bm_profile { unsigned long long cdma_out_time; unsigned long long cdma_out_counter; unsigned long long tpu_process_time; + unsigned long long tpu1_process_time; unsigned long long sent_api_counter; unsigned long long completed_api_counter; #endif diff --git a/models/ChatGLM3/support/include/bmruntime_interface.h b/models/ChatGLM3/support/include/bmruntime_interface.h index 54fd90d..cbf6964 100755 --- a/models/ChatGLM3/support/include/bmruntime_interface.h +++ b/models/ChatGLM3/support/include/bmruntime_interface.h @@ -59,6 +59,12 @@ it will alloc device mem to tensor->device_mem, so user should bmrt_free_device( tensor->device_mem) to free it.*/ DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape); +/* +fill a tensor with data type and shape, and st_mode = 0 as default. +tensor and p_bmrt should not be NULL, shape count should not be 0. 
+it will alloc device mem to tensor->device_mem on devid-th device.*/ +DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape); + /* fill a tensor with device mem existed, tensor byte size should not large than device mem size */ DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem, bm_data_type_t dtype, bm_shape_t shape); @@ -345,6 +351,52 @@ DECL_EXPORT bool bmrt_launch_tensor_multi_cores( const int *core_list, int core_num); +/** + * @name bmrt_memcpy_s2d_parallel + * @brief To copy data from system memory to muti-devices memory in parallel + * @ingroup bmruntime + * + * This API only could be used when the p_bmrt is created with bmrt_create_ex on multi devices. + * After calling this API, datas[:tensor_num[0]] will be copied to the first device, and + * datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device and so on. + * The process of copying data to different devices is done in parallel and to the same device is in sequence. + * + * @param [in] p_bmrt Bmruntime that had been created with multi bm_handles + * @param [in] tensors Array of tensors that will be copied to devices + * @param [in] datas Array of satas allocated in system memory + * @param [in] tensor_num Array of tensor_num that will be copied to each device + * @param [in] device_num Device number +*/ +DECL_EXPORT bool bmrt_memcpy_s2d_parallel( + void *p_bmrt, + bm_tensor_t tensors[], + void *datas[], + int tensor_num[], + int device_num); + +/** + * @name bmrt_memcpy_d2s_parallel + * @brief To copy data from muti-devices memory to system memory in parallel + * @ingroup bmruntime + * + * This API only could be used when the p_bmrt is created with bmrt_create_ex on multi devices. 
+ * After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]] , and + * tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] and so on. + * The process of copying data from different devices is done in parallel and from the same device is in sequence. + * + * @param [in] p_bmrt Bmruntime that had been created with multi bm_handles + * @param [in] datas Array of satas allocated in system memory + * @param [in] tensors Array of tensors that will be copied from devices + * @param [in] tensor_num Array of tensor_num that will be copied from each device + * @param [in] device_num Device number +*/ +DECL_EXPORT bool bmrt_memcpy_d2s_parallel( + void *p_bmrt, + void *datas[], + bm_tensor_t tensors[], + int tensor_num[], + int device_num); + #if defined (__cplusplus) } #endif diff --git a/models/Llama2/demo/demo.cpp b/models/Llama2/demo/demo.cpp index 1f193c4..bc98465 100755 --- a/models/Llama2/demo/demo.cpp +++ b/models/Llama2/demo/demo.cpp @@ -73,7 +73,7 @@ void LLama2::load_sentencepiece(std::string tokenizer_path) { printf("Done!\n"); } -void LLama2::init(const std::vector &devices, std::string model, std::string tokenizer_path) { +void LLama2::init(const std::vector &devices, std::string model_path, std::string tokenizer_path) { // load tokenizer load_sentencepiece(tokenizer_path); @@ -97,8 +97,8 @@ void LLama2::init(const std::vector &devices, std::string model, std::strin assert(NULL != p_bmrt); // load bmodel by file - printf("Model[%s] loading ....\n", model.c_str()); - bool ret = bmrt_load_bmodel(p_bmrt, model.c_str()); + printf("Model[%s] loading ....\n", model_path.c_str()); + bool ret = bmrt_load_bmodel(p_bmrt, model_path.c_str()); assert(true == ret); printf("Done!\n"); diff --git a/models/Qwen/demo/demo_parallel.cpp b/models/Qwen/demo/demo_parallel.cpp index 3e405be..5ddd3e0 100755 --- a/models/Qwen/demo/demo_parallel.cpp +++ b/models/Qwen/demo/demo_parallel.cpp @@ -18,6 +18,8 @@ 
#include "bmruntime_interface.h" #include +static const uint16_t BF16_NEG_10000 = 0xC61C; // -9984 by bfloat16 + // #define EXPORT_RESULTS #ifdef EXPORT_RESULTS #include "cnpy.h" @@ -58,9 +60,6 @@ void dump_tensor(bm_handle_t bm_handle, bm_tensor_t &tensor) { ptr[0] = ptr[0]; } - -static const uint16_t BF16_NEG_10000 = 0xC61C; // -9984 by bfloat16 - static const std::string TOKENIZER_MODEL = "qwen.tiktoken"; class QwenChat { @@ -81,11 +80,11 @@ class QwenChat { std::vector handles; bm_handle_t bm_handle; void *p_bmrt; - std::vector net_blocks; - std::vector net_blocks_cache; const bm_net_info_t *net_embed; const bm_net_info_t *net_embed_cache; const bm_net_info_t *net_lm; + std::vector net_blocks; + std::vector net_blocks_cache; std::vector inputs_embed_512, outputs_embed_512; std::vector inputs_pid, next_pid, inputs_attention, next_attention; std::vector> past_keys, past_values; @@ -142,8 +141,8 @@ void QwenChat::init(const std::vector &devices, std::string model) { // embed, lm_head - name_embed = "embedding_1"; - name_embed_cache = "embedding_0"; + name_embed = "embedding"; + name_embed_cache = "embedding_cache"; name_lm = "lm_head"; net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str()); net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str()); @@ -287,26 +286,6 @@ void QwenChat::deinit() { } } -// after first block, move real result to end of mem -void QwenChat::move2end(const bm_tensor_t &kv) { - if (token_length >= SEQLEN) { - return; - } - auto total_size = bm_mem_get_device_size(kv.device_mem); - auto bytes = total_size / SEQLEN; - auto real_size = token_length * bytes; - auto mem = - bm_mem_from_device(bm_mem_get_device_addr(kv.device_mem), real_size); - auto buffer = new uint8_t[real_size]; - auto dst = new uint8_t[total_size]; - bm_memcpy_d2s(bm_handle, (void *)buffer, mem); - memset(dst, 0, total_size - real_size); - memcpy(dst + total_size - real_size, buffer, real_size); - bm_memcpy_s2d(bm_handle, kv.device_mem, (void *)dst); 
- delete[] buffer; - delete[] dst; -} - int QwenChat::forward_first(std::vector &tokens) { std::vector input_ids(SEQLEN, 0); std::vector position_id(SEQLEN, 0);