diff --git a/.env.example b/.env.example
index d3d368d0..dae36a5a 100644
--- a/.env.example
+++ b/.env.example
@@ -54,4 +54,4 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # TTS_PRICES=0.015,0.030
 # BOT_LANGUAGE=en
 # ENABLE_VISION_FOLLOW_UP_QUESTIONS="true"
-# VISION_MODEL="gpt-4-vision-preview"
\ No newline at end of file
+# VISION_MODEL="gpt-4o"
\ No newline at end of file
diff --git a/README.md b/README.md
index 61b7a41a..34dfa3ef 100644
--- a/README.md
+++ b/README.md
@@ -99,8 +99,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
 | `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
 | `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
-| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
-| `VISION_MODEL` | The Vision to Speech model to use. Allowed values: `gpt-4-vision-preview` | `gpt-4-vision-preview` |
+| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4o |
+| `VISION_MODEL` | The vision model to use. Allowed values: `gpt-4o` | `gpt-4o` |
 | `ENABLE_VISION_FOLLOW_UP_QUESTIONS` | If true, once you send an image to the bot, it uses the configured VISION_MODEL until the conversation ends. Otherwise, it uses the OPENAI_MODEL to follow the conversation. Allowed values: `true` or `false` | `true` |
 | `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
 | `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
diff --git a/bot/main.py b/bot/main.py
index 8e0118d2..7c118ca8 100644
--- a/bot/main.py
+++ b/bot/main.py
@@ -53,7 +53,7 @@ def main():
         'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
         'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
         'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
-        'vision_model': os.environ.get('VISION_MODEL', 'gpt-4-vision-preview'),
+        'vision_model': os.environ.get('VISION_MODEL', 'gpt-4o'),
         'enable_vision_follow_up_questions': os.environ.get('ENABLE_VISION_FOLLOW_UP_QUESTIONS', 'true').lower() == 'true',
         'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
         'vision_detail': os.environ.get('VISION_DETAIL', 'auto'),
diff --git a/bot/openai_helper.py b/bot/openai_helper.py
index 5a1896cf..ef87e625 100644
--- a/bot/openai_helper.py
+++ b/bot/openai_helper.py
@@ -26,8 +26,9 @@
 GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125")
 GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-turbo-preview")
 GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
-GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
-GPT_4_128K_MODELS = ("gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
+GPT_4_VISION_MODELS = ("gpt-4o",)
+GPT_4_128K_MODELS = (
+    "gpt-4-1106-preview", "gpt-4-0125-preview", "gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
 GPT_4O_MODELS = ("gpt-4o",)
 GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS + GPT_4O_MODELS
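A quick illustration (not part of the patch) of how these family tuples are consumed: they are plain membership checks, for example the __count_tokens_vision() hunk further down raises NotImplementedError when the configured model is not in GPT_4_VISION_MODELS. With the values above, "gpt-4o" now passes both the general model check and the vision check. The tuples below are trimmed to the families touched by this change.

GPT_4_VISION_MODELS = ("gpt-4o",)
GPT_4O_MODELS = ("gpt-4o",)
GPT_ALL_MODELS = GPT_4_VISION_MODELS + GPT_4O_MODELS  # trimmed for the illustration

model = "gpt-4o"
print(model in GPT_ALL_MODELS)       # True: accepted as a configured model
print(model in GPT_4_VISION_MODELS)  # True: allowed through the vision code path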
@@ -64,13 +65,13 @@ def are_functions_available(model: str) -> bool:
     if model in ("gpt-3.5-turbo-0301", "gpt-4-0314", "gpt-4-32k-0314"):
         return False
     # Stable models will be updated to support functions on June 27, 2023
-    if model in ("gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k","gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview"):
+    if model in (
+            "gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k", "gpt-4-1106-preview", "gpt-4-0125-preview",
+            "gpt-4-turbo-preview", "gpt-4o"):
         return datetime.date.today() > datetime.date(2023, 6, 27)
     # Models gpt-3.5-turbo-0613 and gpt-3.5-turbo-16k-0613 will be deprecated on June 13, 2024
     if model in ("gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613"):
         return datetime.date.today() < datetime.date(2024, 6, 13)
-    if model == 'gpt-4-vision-preview':
-        return False
     return True
@@ -249,7 +250,8 @@ async def __common_get_chat_response(self, chat_id: int, query: str, stream=Fals
                     self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]
 
             common_args = {
-                'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config['vision_model'],
+                'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config[
+                    'vision_model'],
                 'messages': self.conversations[chat_id],
                 'temperature': self.config['temperature'],
                 'n': self.config['n_choices'],
@@ -385,7 +387,8 @@ async def transcribe(self, filename):
         try:
             with open(filename, "rb") as audio:
                 prompt_text = self.config['whisper_prompt']
-                result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio, prompt=prompt_text)
+                result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio,
+                                                                       prompt=prompt_text)
                 return result.text
         except Exception as e:
             logging.exception(e)
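Taken together with the ENABLE_VISION_FOLLOW_UP_QUESTIONS flag documented above, the hunks at lines 64 and 249 change which model answers follow-ups after an image: once a chat is flagged in conversations_vision, replies use VISION_MODEL, and because the gpt-4-vision-preview special case was deleted, that model is no longer excluded by the function/plugin check. A small sketch with illustrative values (not the bot's code, and the tuple is simplified):

import datetime

def are_functions_available(model: str) -> bool:
    # Simplified copy of the patched check; the real tuple lists more models
    if model in ("gpt-3.5-turbo", "gpt-4", "gpt-4-32k", "gpt-4-turbo-preview", "gpt-4o"):
        return datetime.date.today() > datetime.date(2023, 6, 27)
    return True

config = {'model': 'gpt-4', 'vision_model': 'gpt-4o'}  # illustrative values
conversations_vision = {1234: True}                    # chat 1234 already received an image

chat_id = 1234
model = config['model'] if not conversations_vision[chat_id] else config['vision_model']
print(model)                           # gpt-4o
print(are_functions_available(model))  # True after 2023-06-27; the removed vision-preview branch always returned False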
@@ -429,7 +432,7 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
         if exceeded_max_tokens or exceeded_max_history_size:
             logging.info(f'Chat history for chat ID {chat_id} is too long. Summarising...')
             try:
-
+                last = self.conversations[chat_id][-1]
                 summary = await self.__summarise(self.conversations[chat_id][:-1])
                 logging.debug(f'Summary: {summary}')
@@ -440,20 +443,19 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
                 logging.warning(f'Error while summarising chat history: {str(e)}. Popping elements instead...')
                 self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]
 
-        message = {'role':'user', 'content':content}
+        message = {'role': 'user', 'content': content}
 
         common_args = {
             'model': self.config['vision_model'],
             'messages': self.conversations[chat_id][:-1] + [message],
             'temperature': self.config['temperature'],
-            'n': 1, # several choices is not implemented yet
+            'n': 1,  # several choices is not implemented yet
             'max_tokens': self.config['vision_max_tokens'],
             'presence_penalty': self.config['presence_penalty'],
             'frequency_penalty': self.config['frequency_penalty'],
             'stream': stream
         }
-
         # vision model does not yet support functions
 
         # if self.config['enable_functions']:
@@ -461,7 +463,7 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
         #     if len(functions) > 0:
         #         common_args['functions'] = self.plugin_manager.get_functions_specs()
         #         common_args['function_call'] = 'auto'
-
+
             return await self.client.chat.completions.create(**common_args)
 
         except openai.RateLimitError as e:
@@ -473,7 +475,6 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
         except Exception as e:
             raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e
 
-
     async def interpret_image(self, chat_id, fileobj, prompt=None):
         """
         Interprets a given PNG image file using the Vision model.
@@ -481,15 +482,14 @@ async def interpret_image(self, chat_id, fileobj, prompt=None):
         image = encode_image(fileobj)
         prompt = self.config['vision_prompt'] if prompt is None else prompt
 
-        content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
-                    'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
+        content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
+            'image_url': {'url': image,
+                          'detail': self.config['vision_detail']}}]
 
         response = await self.__common_get_chat_response_vision(chat_id, content)
 
-
-        # functions are not available for this model
-
+
         # if self.config['enable_functions']:
@@ -532,13 +532,12 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
         image = encode_image(fileobj)
         prompt = self.config['vision_prompt'] if prompt is None else prompt
 
-        content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
-                    'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
+        content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
+            'image_url': {'url': image,
+                          'detail': self.config['vision_detail']}}]
 
         response = await self.__common_get_chat_response_vision(chat_id, content, stream=True)
 
-
-
         # if self.config['enable_functions']:
@@ -557,8 +556,8 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
         self.__add_to_history(chat_id, role="assistant", content=answer)
 
         tokens_used = str(self.__count_tokens(self.conversations[chat_id]))
-        #show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
-        #plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
+        # show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
+        # plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
 
         if self.config['show_usage']:
             answer += f"\n\n---\n💰 {tokens_used} {localized_text('stats_tokens', self.config['bot_language'])}"
             # if show_plugins_used:
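To make the request shape concrete, here is a minimal self-contained sketch of the call that interpret_image() and interpret_image_stream() assemble above. Assumptions: OPENAI_API_KEY is set, image_data_url is the base64 data URL produced by encode_image(), and the literal values are the defaults from bot/main.py and the README table; this is an illustration, not the bot's code.

import asyncio
import openai

async def ask_about_image(image_data_url: str) -> str:
    client = openai.AsyncOpenAI()
    content = [
        {'type': 'text', 'text': 'What is in this image'},             # VISION_PROMPT default
        {'type': 'image_url', 'image_url': {'url': image_data_url,
                                            'detail': 'auto'}},        # VISION_DETAIL default
    ]
    response = await client.chat.completions.create(
        model='gpt-4o',    # VISION_MODEL default after this patch
        messages=[{'role': 'user', 'content': content}],
        max_tokens=300,    # VISION_MAX_TOKENS default
        n=1,               # several choices not implemented for vision
    )
    return response.choices[0].message.content

# Example usage: asyncio.run(ask_about_image("data:image/jpeg;base64,..."))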
@@ -651,7 +650,12 @@ def __count_tokens(self, messages) -> int:
         """
         model = self.config['model']
         try:
-            encoding = tiktoken.encoding_for_model(model)
+            # TODO this is a temporary workaround until tiktoken is updated
+            # https://github.com/n3d1117/chatgpt-telegram-bot/issues/577
+            if model in GPT_4O_MODELS:
+                encoding = tiktoken.get_encoding("p50k_base")
+            else:
+                encoding = tiktoken.encoding_for_model(model)
         except KeyError:
             encoding = tiktoken.get_encoding("gpt-3.5-turbo")
@@ -697,7 +701,7 @@ def __count_tokens_vision(self, image_bytes: bytes) -> int:
         model = self.config['vision_model']
         if model not in GPT_4_VISION_MODELS:
             raise NotImplementedError(f"""count_tokens_vision() is not implemented for model {model}.""")
-
+
         w, h = image.size
         if w > h: w, h = h, w
         # this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
@@ -705,7 +709,7 @@ def __count_tokens_vision(self, image_bytes: bytes) -> int:
         detail = self.config['vision_detail']
         if detail == 'low':
             return base_tokens
-        elif detail == 'high' or detail == 'auto': # assuming worst cost for auto
+        elif detail == 'high' or detail == 'auto':  # assuming worst cost for auto
             f = max(w / 768, h / 2048)
             if f > 1:
                 w, h = int(w / f), int(h / f)
diff --git a/requirements.txt b/requirements.txt
index 520d28da..36c1fa62 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,44 @@
-python-dotenv~=1.0.0
-pydub~=0.25.1
-tiktoken==0.7.0
+annotated-types==0.7.0
+anyio==4.4.0
+async-timeout==4.0.3
+backports.tarfile==1.2.0
+Brotli==1.1.0
+certifi==2024.7.4
+charset-normalizer==3.3.2
+click==8.1.7
+distro==1.9.0
+duckduckgo_search==5.3.1b1
+exceptiongroup==1.2.2
+gTTS==2.5.3
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==1.0.5
+httpx==0.27.0
+hyperframe==6.0.1
+idna==3.8
+jaraco.context==6.0.1
+more-itertools==10.4.0
 openai==1.29.0
+pillow==10.3.0
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydub==0.25.1
+python-dotenv==1.0.1
 python-telegram-bot==21.1.1
-requests~=2.31.0
+pytube==15.0.0
+redis==5.0.8
+regex==2024.7.24
+requests==2.31.0
+six==1.16.0
+sniffio==1.3.1
+socksio==1.0.0
+spotipy==2.23.0
 tenacity==8.3.0
-wolframalpha~=5.0.0
-duckduckgo_search==5.3.1b1
-spotipy~=2.23.0
-pytube~=15.0.0
-gtts~=2.5.1
-whois~=0.9.27
-Pillow~=10.3.0
+tiktoken==0.7.0
+tqdm==4.66.5
+typing_extensions==4.12.2
+urllib3==2.2.2
+whois==0.9.27
+wolframalpha==5.0.1
+xmltodict==0.13.0
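Finally, a worked example of the vision token estimate that __count_tokens_vision() now performs for gpt-4o; the full tile computation sits outside the hunk above, so this is a standalone approximation, not a copy of the repo's function. It mirrors the visible downscaling step (shorter side capped at 768 px, longer side at 2048 px) and assumes the constants from the OpenAI vision guide the code comment links to (85 base tokens plus 170 tokens per 512 px tile for 'high'/'auto' detail).

import math

def estimate_vision_tokens(width: int, height: int, detail: str = 'auto') -> int:
    # Approximation following the linked vision guide; not the repo's exact code
    base_tokens = 85
    if detail == 'low':
        return base_tokens
    w, h = (width, height) if width <= height else (height, width)  # w = shorter side
    f = max(w / 768, h / 2048)                                       # shrink to fit 768 x 2048
    if f > 1:
        w, h = int(w / f), int(h / f)
    tiles = math.ceil(w / 512) * math.ceil(h / 512)
    return base_tokens + 170 * tiles

print(estimate_vision_tokens(1024, 2048, detail='high'))  # 1105 = 85 + 6 tiles * 170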