From eeab80d7040a230153128e8530a9ff64028fc502 Mon Sep 17 00:00:00 2001 From: "jason.mjx" Date: Sun, 7 Apr 2024 14:12:21 +0800 Subject: [PATCH] fix the no event loop issue of code interpreter --- benchmark/README.md | 2 +- benchmark/config.py | 11 ++++-- benchmark/metrics/visualization.py | 55 ++++++++++++++++------------ benchmark/models/__init__.py | 2 +- benchmark/models/dashscope.py | 14 +++++-- benchmark/models/qwen.py | 7 +++- benchmark/requirements.txt | 2 +- qwen_agent/tools/code_interpreter.py | 52 ++++++++++++++++---------- 8 files changed, 90 insertions(+), 55 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 265b363..cb14b96 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -245,4 +245,4 @@ The inference_and_exec.py file contains the following configurable options: - `--eval-code-exec-only`: Only evaluate code executable rate - `--gen-exec-only`: Only generate and execuate code without calculating evaluation metrics. - `--gen-only`: Only generate without execuating code and calculating evaluation metrics. -- `--vis-judger`: The model to judge the result correctness for `Visualization` task which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in the version 20231206, and `Qwen-vl-chat` has been deprecated. \ No newline at end of file +- `--vis-judger`: The model to judge the result correctness for `Visualization` task which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in the version 20231206, and `Qwen-vl-chat` has been deprecated. diff --git a/benchmark/config.py b/benchmark/config.py index 2c98965..aec30a4 100644 --- a/benchmark/config.py +++ b/benchmark/config.py @@ -1,6 +1,6 @@ from parser import InternLMReActParser, ReActParser -from models import LLM, QwenVL, Qwen, QwenDashscopeVLModel +from models import LLM, Qwen, QwenDashscopeVLModel, QwenVL from prompt import InternLMReAct, LlamaReAct, QwenReAct react_prompt_map = { @@ -15,7 +15,12 @@ 'internlm': InternLMReActParser, } -model_map = {'qwen': Qwen, 'llama': LLM, 'internlm': LLM, 'qwen-vl-chat': QwenVL} +model_map = { + 'qwen': Qwen, + 'llama': LLM, + 'internlm': LLM, + 'qwen-vl-chat': QwenVL +} model_type_map = { 'qwen-72b-chat': 'qwen', @@ -59,7 +64,7 @@ def get_react_parser(model_name): def get_model(model_name): - if model_name in ["qwen-vl-plus"]: + if model_name in ['qwen-vl-plus']: return QwenDashscopeVLModel(model=model_name) model_path = model_path_map.get(model_name, None) model_cls = model_map.get(model_type_map[model_name], LLM) diff --git a/benchmark/metrics/visualization.py b/benchmark/metrics/visualization.py index f5066e3..983a6b6 100644 --- a/benchmark/metrics/visualization.py +++ b/benchmark/metrics/visualization.py @@ -1,7 +1,8 @@ +import base64 import logging import os import re -import base64 + import torch from config import get_model, get_react_parser from utils.data_utils import load_jsonl, save_jsonl @@ -23,35 +24,38 @@ def encode_image(image_path): - with open(image_path, "rb") as image_file: + with open(image_path, 'rb') as image_file: a = base64.b64encode(image_file.read()).decode('utf-8') return a -def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''): - output = "" +def judger_model_inference(judger_model_name, + judger_model, + imgs=[], + prompt=''): + output = '' if judger_model_name == 'gpt-4-vision-preview': - logging.warning("This is an example of `gpt-4-vision-preview`. 
" - "Please set the API key and use according to your actual situation.") + logging.warning( + 'This is an example of `gpt-4-vision-preview`. ' + 'Please set the API key and use according to your actual situation.' + ) from openai import OpenAI client = OpenAI() content_list = [] - content_list.append({"type": "text", "text": prompt}) + content_list.append({'type': 'text', 'text': prompt}) input_images = [] for img in imgs: if 'http' not in img: base64_image = encode_image(img) - img = f"data:image/jpeg;base64,{base64_image}" - input_images.append({"type": "image_url", 'image_url': img}) + img = f'data:image/jpeg;base64,{base64_image}' + input_images.append({'type': 'image_url', 'image_url': img}) content_list.extend(input_images) response = client.chat.completions.create( - model="gpt-4-vision-preview", - messages=[ - { - "role": "user", - "content": content_list, - } - ], + model='gpt-4-vision-preview', + messages=[{ + 'role': 'user', + 'content': content_list, + }], max_tokens=300, ) output = response.choices[0] @@ -59,7 +63,7 @@ def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''): inputs = [] for img in imgs: if 'http' not in img and judger_model_name == 'qwen-vl-plus': - img = "file://" + img + img = 'file://' + img inputs.append({'image': img}) inputs.append({'text': prompt}) @@ -105,17 +109,21 @@ def check_images_observation(text, images, model_name): eval_visual_prompt = {'zh': EVAL_VISUAL_PROMPT_ZH, 'en': EVAL_VISUAL_PROMPT_EN} -def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vision-preview'): +def eval_visualization_acc(output_fname, + model_name, + judger_model_name='gpt-4-vision-preview'): if judger_model_name == 'gpt-4-vision-preview': judger_model = None elif judger_model_name in ['qwen-vl-chat', 'qwen-vl-plus']: if judger_model_name == 'qwen-vl-chat': - logging.warning('In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the ' - 'evaluation model for `Visualization` task.. If you insist on using it, ' - 'the evaluation results might differ from the official results.') + logging.warning( + 'In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the ' + 'evaluation model for `Visualization` task.. If you insist on using it, ' + 'the evaluation results might differ from the official results.' 
+ ) judger_model = get_model(judger_model_name) else: - raise Exception("Not supported judger model.") + raise Exception('Not supported judger model.') one_action, one_action_right = 0, 0 zero_action, zero_action_right = 0, 0 @@ -139,7 +147,8 @@ def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vi model_name): input_prompt = eval_visual_prompt[item.get('lang', 'en')] format_prompt = input_prompt.format(query=prompt) - output = judger_model_inference(judger_model_name, judger_model, images, format_prompt) + output = judger_model_inference(judger_model_name, judger_model, + images, format_prompt) if 'right' in output.lower(): item['vis_acc'] = True if '<|im_end|>' in item['query']: diff --git a/benchmark/models/__init__.py b/benchmark/models/__init__.py index 1649074..b124c21 100644 --- a/benchmark/models/__init__.py +++ b/benchmark/models/__init__.py @@ -1,4 +1,4 @@ from models.base import HFModel # noqa +from models.dashscope import QwenDashscopeVLModel from models.llm import LLM # noqa from models.qwen import Qwen, QwenVL # noqa -from models.dashscope import QwenDashscopeVLModel diff --git a/benchmark/models/dashscope.py b/benchmark/models/dashscope.py index 6c112c3..5e4b496 100644 --- a/benchmark/models/dashscope.py +++ b/benchmark/models/dashscope.py @@ -1,13 +1,16 @@ import logging -from http import HTTPStatus import time +from http import HTTPStatus + import dashscope class QwenDashscopeVLModel(object): + def __init__(self, model, api_key): self.model = model - dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', default='') + dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', + default='') assert dashscope.api_key, 'DASHSCOPE_API_KEY is required.' def generate(self, prompt, stop_words=[]): @@ -19,7 +22,10 @@ def generate(self, prompt, stop_words=[]): while count < MAX_TRY: response = dashscope.MultiModalConversation.call( self.model, - messages=[{'role': 'user', 'content': prompt}], + messages=[{ + 'role': 'user', + 'content': prompt + }], top_p=0.01, top_k=1, ) @@ -28,7 +34,7 @@ def generate(self, prompt, stop_words=[]): for stop_str in stop_words: idx = output.find(stop_str) if idx != -1: - output = output[: idx + len(stop_str)] + output = output[:idx + len(stop_str)] return output else: err = 'Error code: %s, error message: %s' % ( diff --git a/benchmark/models/qwen.py b/benchmark/models/qwen.py index 193208a..235feaf 100644 --- a/benchmark/models/qwen.py +++ b/benchmark/models/qwen.py @@ -26,11 +26,14 @@ def generate(self, input_text, stop_words=[]): class QwenVL(HFModel): + def __init__(self, model_path): super().__init__(model_path) def generate(self, inputs: list): query = self.tokenizer.from_list_format(inputs) - response, _ = self.model.chat(self.tokenizer, query=query, history=None) + response, _ = self.model.chat(self.tokenizer, + query=query, + history=None) - return response \ No newline at end of file + return response diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt index d8c77ca..748966e 100644 --- a/benchmark/requirements.txt +++ b/benchmark/requirements.txt @@ -3,6 +3,7 @@ func_timeout json5 matplotlib numpy +openai pandas PrettyTable scipy @@ -10,4 +11,3 @@ seaborn sympy transformers==4.33.1 transformers_stream_generator -openai \ No newline at end of file diff --git a/qwen_agent/tools/code_interpreter.py b/qwen_agent/tools/code_interpreter.py index f320f44..a85c74e 100644 --- a/qwen_agent/tools/code_interpreter.py +++ b/qwen_agent/tools/code_interpreter.py @@ -110,7 +110,7 @@ def 
_start_kernel(pid) -> BlockingKernelClient: # Client kc = BlockingKernelClient(connection_file=connection_file) - asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) + asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy()) kc.load_connection_file() kc.start_channels() kc.wait_for_ready() @@ -317,24 +317,36 @@ def call(self, return result if result.strip() else 'Finished execution.' -def _get_multiline_input() -> str: - logger.info( - '// Press ENTER to make a new line. Press CTRL-D to end input.') - lines = [] - while True: - try: - line = input() - except EOFError: # CTRL-D - break - lines.append(line) - logger.info('// Input received.') - if lines: - return '\n'.join(lines) - else: - return '' +# +# The _BasePolicy and AnyThreadEventLoopPolicy below are borrowed from Tornado. +# Ref: https://www.tornadoweb.org/en/stable/_modules/tornado/platform/asyncio.html#AnyThreadEventLoopPolicy +# +if sys.platform == 'win32' and hasattr(asyncio, + 'WindowsSelectorEventLoopPolicy'): + _BasePolicy = asyncio.WindowsSelectorEventLoopPolicy # type: ignore +else: + _BasePolicy = asyncio.DefaultEventLoopPolicy -if __name__ == '__main__': - tool = CodeInterpreter() - while True: - logger.info(tool.call(_get_multiline_input())) + +class AnyThreadEventLoopPolicy(_BasePolicy): # type: ignore + """Event loop policy that allows loop creation on any thread. + + The default `asyncio` event loop policy only automatically creates + event loops in the main threads. Other threads must create event + loops explicitly or `asyncio.get_event_loop` (and therefore + `.IOLoop.current`) will fail. Installing this policy allows event + loops to be created automatically on any thread. + + Usage:: + asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy()) + """ + + def get_event_loop(self) -> asyncio.AbstractEventLoop: + try: + return super().get_event_loop() + except RuntimeError: + # "There is no current event loop in thread %r" + loop = self.new_event_loop() + self.set_event_loop(loop) + return loop
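
Note (not part of the patch): a minimal sketch of the failure this change works around and of the policy's effect. The worker() helper and the thread usage below are illustrative assumptions, not code from the repository; the sketch assumes qwen_agent with this patch applied is importable, and a Python version where asyncio event loop policies are still supported.

import asyncio
import threading

# AnyThreadEventLoopPolicy is defined in qwen_agent/tools/code_interpreter.py
# by the hunk above (borrowed from Tornado).
from qwen_agent.tools.code_interpreter import AnyThreadEventLoopPolicy


def worker():
    # Under asyncio's default policy, calling get_event_loop() from a
    # non-main thread that never created a loop raises:
    #   RuntimeError: There is no current event loop in thread ...
    # which is the kind of "no event loop" failure this patch addresses
    # in _start_kernel(). With AnyThreadEventLoopPolicy installed, the
    # overridden get_event_loop() creates and registers a loop on demand.
    loop = asyncio.get_event_loop()
    print('event loop obtained in worker thread:', loop)
    loop.close()


# Same call the patched _start_kernel() makes before kc.wait_for_ready().
asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())

t = threading.Thread(target=worker)
t.start()
t.join()

This mirrors what the hunk above does: the policy is installed right before the blocking Jupyter kernel client is set up, so the client can obtain an event loop even when the code interpreter is driven from a worker thread rather than the main thread.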