diff --git a/models/Qwen2/python_demo_parallel/README.md b/models/Qwen2/python_demo_parallel/README.md
index b29f5fe..8f315e9 100755
--- a/models/Qwen2/python_demo_parallel/README.md
+++ b/models/Qwen2/python_demo_parallel/README.md
@@ -59,15 +59,15 @@ python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/bmodels
 
 git submodule update --init
 cd python_demo_parallel
-mkdir build && cd build
-cmake .. && make -j8
-python3 pipeline.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
+mkdir build
+cd build && cmake .. && make -j8 && cp *cpython* .. && cd ..
+python3 pipeline.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
 ```
 
 运行web demo
 ```shell
 pip3 install gradio==3.39.0 mdtex2html==1.2.0 dfss
-python3 web_demo.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
+python3 web_demo.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
 ```
 
 ## 4. 常见问题
diff --git a/models/Qwen2/python_demo_parallel/pipeline.py b/models/Qwen2/python_demo_parallel/pipeline.py
index d0abfc9..414d7cd 100755
--- a/models/Qwen2/python_demo_parallel/pipeline.py
+++ b/models/Qwen2/python_demo_parallel/pipeline.py
@@ -152,16 +152,25 @@ def _generate_predictions(self, tokens):
         Generate predictions for the given tokens.
         """
         # First token
+        tok_num = 0
+        first_start = time.time()
         next_token = self.model.forward_first(tokens)
+        first_end = time.time()
         output_tokens = [next_token]
 
         # Following tokens
         while True:
             next_token = self.model.forward_next(next_token)
             if next_token == self.EOS:
+                next_end = time.time()
+                first_duration = first_end - first_start
+                next_duration = next_end - first_end
+                tps = tok_num / next_duration
+                yield self.answer_cur + f"\n\nFTL: {first_duration:.3f} s\nTPS: {tps:.3f} token/s", self.history
                 break
             output_tokens += [next_token]
             self.answer_cur = self.tokenizer.decode(output_tokens)
+            tok_num += 1
             if self.model.token_length >= self.model.SEQLEN:
                 self.update_history()
                 yield self.answer_cur + "\n\n\nReached the maximum length; The history context has been cleared.", self.history
@@ -169,8 +178,10 @@ def _generate_predictions(self, tokens):
             else:
                 yield self.answer_cur, self.history
 
-        self.update_history()
-
+        if self.enable_history:
+            self.update_history()
+        else:
+            self.clear()
 
 def main(args):
     model = Qwen2(args)
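
For reference, the timing added to `_generate_predictions` follows a common two-phase pattern: time the prefill (first token) separately from the decode loop, then report first-token latency (FTL) and decode throughput (TPS). Below is a minimal standalone sketch of that pattern; `DummyModel`, its `forward_first`/`forward_next` methods, and the EOS id are hypothetical stand-ins for illustration only, not part of the repository, which drives the compiled multi-device bmodel through its pybind wrapper instead.

```python
# Minimal sketch of the FTL/TPS measurement pattern introduced in the diff above.
import time


class DummyModel:
    # Hypothetical stand-in for the compiled model; the EOS id is assumed here.
    EOS = 151645

    def forward_first(self, tokens):
        time.sleep(0.05)  # pretend prefill cost
        return 1

    def forward_next(self, token):
        time.sleep(0.01)  # pretend per-token decode cost
        return token + 1 if token < 20 else self.EOS


def generate_with_stats(model, tokens):
    first_start = time.time()
    next_token = model.forward_first(tokens)  # prefill: produces the first token
    first_end = time.time()

    tok_num = 0
    while True:
        next_token = model.forward_next(next_token)
        if next_token == model.EOS:
            break
        tok_num += 1
    next_end = time.time()

    ftl = first_end - first_start           # first-token latency (s)
    tps = tok_num / (next_end - first_end)  # decode throughput (token/s)
    return ftl, tps


if __name__ == "__main__":
    ftl, tps = generate_with_stats(DummyModel(), [1, 2, 3])
    print(f"FTL: {ftl:.3f} s\nTPS: {tps:.3f} token/s")
```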