From eeab80d7040a230153128e8530a9ff64028fc502 Mon Sep 17 00:00:00 2001 From: "jason.mjx" Date: Sun, 7 Apr 2024 14:12:21 +0800 Subject: [PATCH] fix the no event loop issue of code interpreter --- benchmark/README.md | 2 +- benchmark/config.py | 11 ++++-- benchmark/metrics/visualization.py | 55 ++++++++++++++++------------ benchmark/models/__init__.py | 2 +- benchmark/models/dashscope.py | 14 +++++-- benchmark/models/qwen.py | 7 +++- benchmark/requirements.txt | 2 +- qwen_agent/tools/code_interpreter.py | 52 ++++++++++++++++---------- 8 files changed, 90 insertions(+), 55 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 265b363..cb14b96 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -245,4 +245,4 @@ The inference_and_exec.py file contains the following configurable options: - `--eval-code-exec-only`: Only evaluate code executable rate - `--gen-exec-only`: Only generate and execuate code without calculating evaluation metrics. - `--gen-only`: Only generate without execuating code and calculating evaluation metrics. -- `--vis-judger`: The model to judge the result correctness for `Visualization` task which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in the version 20231206, and `Qwen-vl-chat` has been deprecated. \ No newline at end of file +- `--vis-judger`: The model to judge the result correctness for `Visualization` task which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in the version 20231206, and `Qwen-vl-chat` has been deprecated. diff --git a/benchmark/config.py b/benchmark/config.py index 2c98965..aec30a4 100644 --- a/benchmark/config.py +++ b/benchmark/config.py @@ -1,6 +1,6 @@ from parser import InternLMReActParser, ReActParser -from models import LLM, QwenVL, Qwen, QwenDashscopeVLModel +from models import LLM, Qwen, QwenDashscopeVLModel, QwenVL from prompt import InternLMReAct, LlamaReAct, QwenReAct react_prompt_map = { @@ -15,7 +15,12 @@ 'internlm': InternLMReActParser, } -model_map = {'qwen': Qwen, 'llama': LLM, 'internlm': LLM, 'qwen-vl-chat': QwenVL} +model_map = { + 'qwen': Qwen, + 'llama': LLM, + 'internlm': LLM, + 'qwen-vl-chat': QwenVL +} model_type_map = { 'qwen-72b-chat': 'qwen', @@ -59,7 +64,7 @@ def get_react_parser(model_name): def get_model(model_name): - if model_name in ["qwen-vl-plus"]: + if model_name in ['qwen-vl-plus']: return QwenDashscopeVLModel(model=model_name) model_path = model_path_map.get(model_name, None) model_cls = model_map.get(model_type_map[model_name], LLM) diff --git a/benchmark/metrics/visualization.py b/benchmark/metrics/visualization.py index f5066e3..983a6b6 100644 --- a/benchmark/metrics/visualization.py +++ b/benchmark/metrics/visualization.py @@ -1,7 +1,8 @@ +import base64 import logging import os import re -import base64 + import torch from config import get_model, get_react_parser from utils.data_utils import load_jsonl, save_jsonl @@ -23,35 +24,38 @@ def encode_image(image_path): - with open(image_path, "rb") as image_file: + with open(image_path, 'rb') as image_file: a = base64.b64encode(image_file.read()).decode('utf-8') return a -def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''): - output = "" +def judger_model_inference(judger_model_name, + judger_model, + imgs=[], + prompt=''): + output = '' if judger_model_name == 'gpt-4-vision-preview': - logging.warning("This is an example of `gpt-4-vision-preview`. 
" - "Please set the API key and use according to your actual situation.") + logging.warning( + 'This is an example of `gpt-4-vision-preview`. ' + 'Please set the API key and use according to your actual situation.' + ) from openai import OpenAI client = OpenAI() content_list = [] - content_list.append({"type": "text", "text": prompt}) + content_list.append({'type': 'text', 'text': prompt}) input_images = [] for img in imgs: if 'http' not in img: base64_image = encode_image(img) - img = f"data:image/jpeg;base64,{base64_image}" - input_images.append({"type": "image_url", 'image_url': img}) + img = f'data:image/jpeg;base64,{base64_image}' + input_images.append({'type': 'image_url', 'image_url': img}) content_list.extend(input_images) response = client.chat.completions.create( - model="gpt-4-vision-preview", - messages=[ - { - "role": "user", - "content": content_list, - } - ], + model='gpt-4-vision-preview', + messages=[{ + 'role': 'user', + 'content': content_list, + }], max_tokens=300, ) output = response.choices[0] @@ -59,7 +63,7 @@ def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''): inputs = [] for img in imgs: if 'http' not in img and judger_model_name == 'qwen-vl-plus': - img = "file://" + img + img = 'file://' + img inputs.append({'image': img}) inputs.append({'text': prompt}) @@ -105,17 +109,21 @@ def check_images_observation(text, images, model_name): eval_visual_prompt = {'zh': EVAL_VISUAL_PROMPT_ZH, 'en': EVAL_VISUAL_PROMPT_EN} -def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vision-preview'): +def eval_visualization_acc(output_fname, + model_name, + judger_model_name='gpt-4-vision-preview'): if judger_model_name == 'gpt-4-vision-preview': judger_model = None elif judger_model_name in ['qwen-vl-chat', 'qwen-vl-plus']: if judger_model_name == 'qwen-vl-chat': - logging.warning('In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the ' - 'evaluation model for `Visualization` task.. If you insist on using it, ' - 'the evaluation results might differ from the official results.') + logging.warning( + 'In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the ' + 'evaluation model for `Visualization` task.. If you insist on using it, ' + 'the evaluation results might differ from the official results.' 
+ ) judger_model = get_model(judger_model_name) else: - raise Exception("Not supported judger model.") + raise Exception('Not supported judger model.') one_action, one_action_right = 0, 0 zero_action, zero_action_right = 0, 0 @@ -139,7 +147,8 @@ def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vi model_name): input_prompt = eval_visual_prompt[item.get('lang', 'en')] format_prompt = input_prompt.format(query=prompt) - output = judger_model_inference(judger_model_name, judger_model, images, format_prompt) + output = judger_model_inference(judger_model_name, judger_model, + images, format_prompt) if 'right' in output.lower(): item['vis_acc'] = True if '<|im_end|>' in item['query']: diff --git a/benchmark/models/__init__.py b/benchmark/models/__init__.py index 1649074..b124c21 100644 --- a/benchmark/models/__init__.py +++ b/benchmark/models/__init__.py @@ -1,4 +1,4 @@ from models.base import HFModel # noqa +from models.dashscope import QwenDashscopeVLModel from models.llm import LLM # noqa from models.qwen import Qwen, QwenVL # noqa -from models.dashscope import QwenDashscopeVLModel diff --git a/benchmark/models/dashscope.py b/benchmark/models/dashscope.py index 6c112c3..5e4b496 100644 --- a/benchmark/models/dashscope.py +++ b/benchmark/models/dashscope.py @@ -1,13 +1,16 @@ import logging -from http import HTTPStatus import time +from http import HTTPStatus + import dashscope class QwenDashscopeVLModel(object): + def __init__(self, model, api_key): self.model = model - dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', default='') + dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', + default='') assert dashscope.api_key, 'DASHSCOPE_API_KEY is required.' def generate(self, prompt, stop_words=[]): @@ -19,7 +22,10 @@ def generate(self, prompt, stop_words=[]): while count < MAX_TRY: response = dashscope.MultiModalConversation.call( self.model, - messages=[{'role': 'user', 'content': prompt}], + messages=[{ + 'role': 'user', + 'content': prompt + }], top_p=0.01, top_k=1, ) @@ -28,7 +34,7 @@ def generate(self, prompt, stop_words=[]): for stop_str in stop_words: idx = output.find(stop_str) if idx != -1: - output = output[: idx + len(stop_str)] + output = output[:idx + len(stop_str)] return output else: err = 'Error code: %s, error message: %s' % ( diff --git a/benchmark/models/qwen.py b/benchmark/models/qwen.py index 193208a..235feaf 100644 --- a/benchmark/models/qwen.py +++ b/benchmark/models/qwen.py @@ -26,11 +26,14 @@ def generate(self, input_text, stop_words=[]): class QwenVL(HFModel): + def __init__(self, model_path): super().__init__(model_path) def generate(self, inputs: list): query = self.tokenizer.from_list_format(inputs) - response, _ = self.model.chat(self.tokenizer, query=query, history=None) + response, _ = self.model.chat(self.tokenizer, + query=query, + history=None) - return response \ No newline at end of file + return response diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt index d8c77ca..748966e 100644 --- a/benchmark/requirements.txt +++ b/benchmark/requirements.txt @@ -3,6 +3,7 @@ func_timeout json5 matplotlib numpy +openai pandas PrettyTable scipy @@ -10,4 +11,3 @@ seaborn sympy transformers==4.33.1 transformers_stream_generator -openai \ No newline at end of file diff --git a/qwen_agent/tools/code_interpreter.py b/qwen_agent/tools/code_interpreter.py index f320f44..a85c74e 100644 --- a/qwen_agent/tools/code_interpreter.py +++ b/qwen_agent/tools/code_interpreter.py @@ -110,7 +110,7 @@ def 
_start_kernel(pid) -> BlockingKernelClient: # Client kc = BlockingKernelClient(connection_file=connection_file) - asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) + asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy()) kc.load_connection_file() kc.start_channels() kc.wait_for_ready() @@ -317,24 +317,36 @@ def call(self, return result if result.strip() else 'Finished execution.' -def _get_multiline_input() -> str: - logger.info( - '// Press ENTER to make a new line. Press CTRL-D to end input.') - lines = [] - while True: - try: - line = input() - except EOFError: # CTRL-D - break - lines.append(line) - logger.info('// Input received.') - if lines: - return '\n'.join(lines) - else: - return '' +# +# The _BasePolicy and AnyThreadEventLoopPolicy below are borrowed from Tornado. +# Ref: https://www.tornadoweb.org/en/stable/_modules/tornado/platform/asyncio.html#AnyThreadEventLoopPolicy +# +if sys.platform == 'win32' and hasattr(asyncio, + 'WindowsSelectorEventLoopPolicy'): + _BasePolicy = asyncio.WindowsSelectorEventLoopPolicy # type: ignore +else: + _BasePolicy = asyncio.DefaultEventLoopPolicy -if __name__ == '__main__': - tool = CodeInterpreter() - while True: - logger.info(tool.call(_get_multiline_input())) + +class AnyThreadEventLoopPolicy(_BasePolicy): # type: ignore + """Event loop policy that allows loop creation on any thread. + + The default `asyncio` event loop policy only automatically creates + event loops in the main threads. Other threads must create event + loops explicitly or `asyncio.get_event_loop` (and therefore + `.IOLoop.current`) will fail. Installing this policy allows event + loops to be created automatically on any thread. + + Usage:: + asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy()) + """ + + def get_event_loop(self) -> asyncio.AbstractEventLoop: + try: + return super().get_event_loop() + except RuntimeError: + # "There is no current event loop in thread %r" + loop = self.new_event_loop() + self.set_event_loop(loop) + return loop
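
Note (not part of the patch): a minimal sketch of the failure this change works around and of the policy's effect. The worker() helper and the thread usage below are illustrative assumptions, not code from the repository; the sketch assumes qwen_agent with this patch applied is importable, and a Python version where asyncio event loop policies are still supported.

import asyncio
import threading

# AnyThreadEventLoopPolicy is defined in qwen_agent/tools/code_interpreter.py
# by the hunk above (borrowed from Tornado).
from qwen_agent.tools.code_interpreter import AnyThreadEventLoopPolicy


def worker():
    # Under asyncio's default policy, calling get_event_loop() from a
    # non-main thread that never created a loop raises:
    #   RuntimeError: There is no current event loop in thread ...
    # which is the kind of "no event loop" failure this patch addresses
    # in _start_kernel(). With AnyThreadEventLoopPolicy installed, the
    # overridden get_event_loop() creates and registers a loop on demand.
    loop = asyncio.get_event_loop()
    print('event loop obtained in worker thread:', loop)
    loop.close()


# Same call the patched _start_kernel() makes before kc.wait_for_ready().
asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())

t = threading.Thread(target=worker)
t.start()
t.join()

This mirrors what the hunk above does: the policy is installed right before the blocking Jupyter kernel client is set up, so the client can obtain an event loop even when the code interpreter is driven from a worker thread rather than the main thread.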