Skip to content

Commit

Permalink
[Qwen2] show FTL & TPS in python_demo_parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
chuxiaoyi2023 committed Aug 28, 2024
1 parent 65911d4 commit 87ce2e8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
8 changes: 4 additions & 4 deletions models/Qwen2/python_demo_parallel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/bmodels
git submodule update --init

cd python_demo_parallel
mkdir build && cd build
cmake .. && make -j8
python3 pipeline.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
mkdir build
cd build && cmake .. && make -j8 && cp *cpython* .. && cd ..
python3 pipeline.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
```

运行web demo
```shell
pip3 install gradio==3.39.0 mdtex2html==1.2.0 dfss
python3 web_demo.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
python3 web_demo.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
```

## 4. 常见问题
Expand Down
15 changes: 13 additions & 2 deletions models/Qwen2/python_demo_parallel/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,25 +152,36 @@ def _generate_predictions(self, tokens):
Generate predictions for the given tokens.
"""
# First token
tok_num = 0
first_start = time.time()
next_token = self.model.forward_first(tokens)
first_end = time.time()
output_tokens = [next_token]

# Following tokens
while True:
next_token = self.model.forward_next(next_token)
if next_token == self.EOS:
next_end = time.time()
first_duration = first_end - first_start
next_duration = next_end - first_end
tps = tok_num / next_duration
yield self.answer_cur + f"\n\nFTL: {first_duration:.3f} s\nTPS: {tps:.3f} token/s", self.history
break
output_tokens += [next_token]
self.answer_cur = self.tokenizer.decode(output_tokens)
tok_num += 1
if self.model.token_length >= self.model.SEQLEN:
self.update_history()
yield self.answer_cur + "\n\n\nReached the maximum length; The history context has been cleared.", self.history
break
else:
yield self.answer_cur, self.history

self.update_history()

if self.enable_history:
self.update_history()
else:
self.clear()

def main(args):
model = Qwen2(args)
Expand Down

0 comments on commit 87ce2e8

Please sign in to comment.