Skip to content

Commit

Permalink
[Qwen2] show FTL & TPS in python_demo_parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
chuxiaoyi2023 committed Aug 28, 2024
1 parent 65911d4 commit 87ce2e8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
8 changes: 4 additions & 4 deletions models/Qwen2/python_demo_parallel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/bmodels
git submodule update --init

cd python_demo_parallel
mkdir build && cd build
cmake .. && make -j8
python3 pipeline.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
mkdir build
cd build && cmake .. && make -j8 && cp *cpython* .. && cd ..
python3 pipeline.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
```

运行web demo
```shell
pip3 install gradio==3.39.0 mdtex2html==1.2.0 dfss
python3 web_demo.py --model_path ../compile/qwen2-7b_int4_seq8192_8dev.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
python3 web_demo.py --model_path ./qwen2-7b_int4_seq8192_8dev_static.bmodel --tokenizer_path ../support/token_config/ --devid 0,1,2,3,4,5,6,7
```

## 4. 常见问题
Expand Down
15 changes: 13 additions & 2 deletions models/Qwen2/python_demo_parallel/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,25 +152,36 @@ def _generate_predictions(self, tokens):
Generate predictions for the given tokens.
"""
# First token
tok_num = 0
first_start = time.time()
next_token = self.model.forward_first(tokens)
first_end = time.time()
output_tokens = [next_token]

# Following tokens
while True:
next_token = self.model.forward_next(next_token)
if next_token == self.EOS:
next_end = time.time()
first_duration = first_end - first_start
next_duration = next_end - first_end
tps = tok_num / next_duration
yield self.answer_cur + f"\n\nFTL: {first_duration:.3f} s\nTPS: {tps:.3f} token/s", self.history
break
output_tokens += [next_token]
self.answer_cur = self.tokenizer.decode(output_tokens)
tok_num += 1
if self.model.token_length >= self.model.SEQLEN:
self.update_history()
yield self.answer_cur + "\n\n\nReached the maximum length; The history context has been cleared.", self.history
break
else:
yield self.answer_cur, self.history

self.update_history()

if self.enable_history:
self.update_history()
else:
self.clear()

def main(args):
model = Qwen2(args)
Expand Down

0 comments on commit 87ce2e8

Please sign in to comment.