Merge pull request #5 from Zhou-Shilin/main

feat!: Add advanced mode support
CubeGPT · May 5, 2024 · ae9d7bd · ae9d7bd
2 parents 9b1973c + 21f9c13
commit ae9d7bd
Show file tree

Hide file tree

Showing 5 changed files with 205 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
+__pycache__
 generated/*
 
 logs/*
 test.py
-__pycache__
+_config.yaml
diff --git a/config.yaml b/config.yaml
@@ -1,12 +1,34 @@
+########## EDIT REQUIRED ##########
+
 # GPT SETTINGS #
-# EDIT REQUIRED
 # Get your api key from openai. Remember google/bing is always your best friend.
 # Model names: gpt-4-turbo-preview, gpt-3.5-turbo, etc.
 # Recommend -> gpt-4-turbo-preview, which codes more accurately and is less likely to write bugs, but is more expensive.
 
-API_KEY: ""
+API_KEY: "" # Free API Key with GPT-4 access: https://github.com/CubeGPT/.github/discussions/1
 BASE_URL: "https://api.openai.com/v1/chat/completions"
-GENERATE_MODEL: "gpt-4-turbo-2024-04-09" # Don't use gpt-4, because this model is longer supports json modes.
+
+GENERATE_MODEL: "gpt-4-turbo-preview" # Don't use gpt-4, because that model no longer supports JSON mode.
+
+
+# ADVANCED MODE #
+# This mode is experimental, but we highly recommend enabling it for better results.
+ADVANCED_MODE: True
+IMAGE_GENERATION_MODEL: "dall-e-3"
+IMAGE_SIZE: "1024x1024"
+VISION_MODEL: "gpt-4-vision-preview"
+
+# Note: If you are using the free API key above, you can't use the advanced mode since it doesn't support dall-e-3 and gpt-4-vision-preview models.
+USE_DIFFERENT_APIKEY_FOR_DALLE_MODEL: False
+DALLE_API_KEY: ""
+DALLE_BASE_URL: "https://api.openai.com/v1/chat/completions"
+
+USE_DIFFERENT_APIKEY_FOR_VISION_MODEL: False
+VISION_API_KEY: ""
+VISION_BASE_URL: "https://api.openai.com/v1/chat/completions"
+
+
+########## EDIT OPTIONAL ##########
 
 # PROMPT SETTINGS #
 # If you don't know what it is, please don't touch it. Be sure to backup before editing.
@@ -48,6 +70,65 @@ SYS_GEN: |
 USR_GEN: | 
   %DESCRIPTION%
 
+## Advanced Mode ##
+
+### Programme ###
+BTR_DESC_SYS_GEN: |
+  You are a Minecraft schematic designer. Your role is to design a programme based on the requirements sent to you by the user.
+  For example,
+  User input: "A cafe."
+  Response: "A small cafe with a modern design, red roof and brown door, big windows. Inside, there's two tables and a bar."
+
+BTR_DESC_USR_GEN: |
+  %DESCRIPTION%
+
+### Image Tag Generation ###
+IMG_TAG_SYS_GEN: |
+  You work for a Minecraft schematic company and you need to use AI to generate the design image based on the designer's architectural programme. Please respond with the tags you'd like to use for the image generation.
+  Never respond with anything else.
+  Example response: "A minecraft building with a modern design, red roof and brown door, big windows."
+
+IMG_TAG_USR_GEN: |
+  Designer's programme: %PROGRAMME%
+
+### Structure Generation (Advanced with gpt-4-vision) ###
+SYS_GEN_ADV: |
+  You are a minecraft structure builder bot. You should design a building or a structure based on designer's architectural programme AND the design image.
+  Response in json like this:
+  {
+      "materials": [
+            "A: \"minecraft:air\"",
+            "S: \"minecraft:stone\""
+      ],
+      "structures": [
+          {
+              "floor": 0,
+              "structure": "SSSSSSSS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSSSSSSSS"
+          },
+          {
+              "floor": 1,
+              "structure": "SSGGGGSS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSSSSSSSS"
+          },
+          {
+              "floor": 2,
+              "structure": "SSGGGGSS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSSSSSSSS"
+          },
+          {
+              "floor": 3,
+              "structure": "SSSSSSSS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSAAAAAAS\nSSSSSSSS"
+          },
+          {
+              "floor": 4,
+              "structure": "SSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\nSSSSSSSS\n"
+          }
+      ]
+  }
+  Never respond with anything else. Do not design a building which is too large (more than 10 floors). Never use markdown format. Use \n for line feed.
+
+USR_GEN_ADV: |
+  %DESCRIPTION%
+  The image is attached below.
+
 # Developer Settings #
 DEBUG_MODE: True
 VERSION_NUMBER: "Alpha-1.0" #NEVER EDIT THIS IF YOU DON'T KNOW WHAT ARE YOU DOING
diff --git a/console.py b/console.py
@@ -37,6 +37,24 @@ def generate_plugin(description):
 
     return schem
 
+def get_schematic_advanced(description):
+    """
+    Generates a schematic using the advanced (programme + design image) pipeline.
+
+    The description is expanded into a design programme, the programme is turned
+    into an image prompt, an image is generated from that prompt, and the image is
+    then sent together with the description to the vision model.
+
+    Args:
+        description (str): The user's description of the structure.
+
+    Returns:
+        The value returned by core.text_to_schem() for the vision model's reply.
+    """
+    print("(Advanced Mode) Generating programme...")
+    programme = core.askgpt(
+        config.BTR_DESC_SYS_GEN,
+        config.BTR_DESC_USR_GEN.replace("%DESCRIPTION%", description),
+        config.GENERATE_MODEL,
+        disable_json_mode=True,
+    )
+
+    print("(Advanced Mode) Generating image tag...")
+    image_tag = core.askgpt(
+        config.IMG_TAG_SYS_GEN,
+        config.IMG_TAG_USR_GEN.replace("%PROGRAMME%", programme),
+        config.GENERATE_MODEL,
+        disable_json_mode=True,
+    )
+
+    print("(Advanced Mode) Generating image...")
+    # Steer the image model towards a Minecraft look. The suffix previously ended
+    # with an unmatched ")" that was sent to the image model as part of the prompt.
+    tag = image_tag + ", minecraft"
+    image_url = core.ask_dall_e(tag)
+
+    print("(Advanced Mode) Generating schematic...")
+    # NOTE(review): SYS_GEN_ADV refers to the designer's programme, but only the raw
+    # description is substituted here - confirm whether `programme` should be sent.
+    response = core.askgpt(
+        config.SYS_GEN_ADV,
+        config.USR_GEN_ADV.replace("%DESCRIPTION%", description),
+        config.VISION_MODEL,
+        image_url=image_url,
+    )
+
+    return core.text_to_schem(response)
+
 if __name__ == "__main__":
     core.initialize()
 
@@ -54,7 +72,11 @@ def generate_plugin(description):
 
     print("Generating...")
 
-    schem = generate_plugin(description)
+    if config.ADVANCED_MODE:
+        print("Advanced mode is enabled. Generating a schematic with advanced features.")
+        schem = get_schematic_advanced(description)
+    else:
+        schem = generate_plugin(description)
 
     logger(f"console: Saving {name}.schem to generated/ folder.")
     version_tag = core.input_version_to_mcs_tag(version)

diff --git a/core.py b/core.py
@@ -2,6 +2,9 @@
 import mcschematic
 import sys
 import json
+import requests
+import base64
+import uuid
 
 from log_writer import logger
 import config
@@ -20,35 +23,59 @@ def initialize():
     """
     logger(f"Launch. Software version {config.VERSION_NUMBER}, platform {sys.platform}")
 
-def askgpt(system_prompt: str, user_prompt: str, model_name: str):
+def askgpt(system_prompt: str, user_prompt: str, model_name: str, disable_json_mode: bool = False, image_url: str = None):
     """
     Interacts with ChatGPT using the specified prompts.
 
     Args:
         system_prompt (str): The system prompt.
         user_prompt (str): The user prompt.
+        model_name (str): The model name to use.
+        disable_json_mode (bool): Whether to disable JSON mode.
 
     Returns:
         str: The response from ChatGPT.
     """
-    client = OpenAI(api_key=config.API_KEY, base_url=config.BASE_URL)
+    if image_url is not None and config.USE_DIFFERENT_APIKEY_FOR_VISION_MODEL:
+        logger("Using different API key for vision model.")
+        client = OpenAI(api_key=config.VISION_API_KEY, base_url=config.VISION_BASE_URL)
+    else:
+        client = OpenAI(api_key=config.API_KEY, base_url=config.BASE_URL)
+
     logger("Initialized the OpenAI client.")
 
     # Define the messages for the conversation
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt}
-    ]
+    if image_url is not None:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": [
+                {"type": "text", "text": user_prompt},
+                {"type": "image_url", "image_url": {"url": image_url}}
+                ]
+            }
+        ]
+    else:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+
 
     logger(f"askgpt: system {system_prompt}")
     logger(f"askgpt: user {user_prompt}")
 
     # Create a chat completion
-    response = client.chat.completions.create(
-        model=model_name,
-        response_format={"type": "json_object"},
-        messages=messages
-    )
+    if disable_json_mode:
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=messages
+        )
+    else:
+        response = client.chat.completions.create(
+            model=model_name,
+            response_format={"type": "json_object"},
+            messages=messages
+        )
 
     logger(f"askgpt: response {response}")
 
@@ -57,6 +84,37 @@ def askgpt(system_prompt: str, user_prompt: str, model_name: str):
     logger(f"askgpt: extracted reply {assistant_reply}")
     return assistant_reply
 
+def ask_dall_e(description: str):
+    """
+    Requests a single design image from the configured image generation model.
+
+    Args:
+        description (str): The prompt text sent to the image model.
+
+    Returns:
+        str: The URL of the generated image.
+    """
+    # Use the dedicated DALL-E credentials when configured, otherwise the main ones.
+    if config.USE_DIFFERENT_APIKEY_FOR_DALLE_MODEL:
+        api_key, base_url = config.DALLE_API_KEY, config.DALLE_BASE_URL
+    else:
+        api_key, base_url = config.API_KEY, config.BASE_URL
+
+    # NOTE(review): presumably `OpenAI` is the official SDK client, which appends the
+    # endpoint path to base_url - confirm the configured *_BASE_URL values are API roots.
+    client = OpenAI(api_key=api_key, base_url=base_url)
+
+    logger("ask_dall_e: Generating design image using DALL-E API.")
+
+    request_args = {
+        "model": config.IMAGE_GENERATION_MODEL,
+        "prompt": description,
+        "size": config.IMAGE_SIZE,
+        "quality": "standard",
+        "n": 1,
+    }
+    result = client.images.generate(**request_args)
+
+    # Only one image is requested, so the first entry is the generated image.
+    image_url = result.data[0].url
+    logger(f"ask_dall_e: Generated image URL {image_url}")
+
+    return image_url
+
 def text_to_schem(text: str):
     """
     Converts a JSON string to a Minecraft schematic.

diff --git a/ui.py b/ui.py
@@ -29,6 +29,24 @@ def get_schematic(description):
 
     return schem
 
+def get_schematic_advanced(description):
+    """
+    Generates a schematic using the advanced (programme + design image) pipeline.
+
+    The description is expanded into a design programme, the programme is turned
+    into an image prompt, an image is generated from that prompt, and the image is
+    then sent together with the description to the vision model.
+
+    Args:
+        description (str): The user's description of the structure.
+
+    Returns:
+        The value returned by core.text_to_schem() for the vision model's reply.
+    """
+    print("(Advanced Mode) Generating programme...")
+    programme = core.askgpt(
+        config.BTR_DESC_SYS_GEN,
+        config.BTR_DESC_USR_GEN.replace("%DESCRIPTION%", description),
+        config.GENERATE_MODEL,
+        disable_json_mode=True,
+    )
+
+    print("(Advanced Mode) Generating image tag...")
+    image_tag = core.askgpt(
+        config.IMG_TAG_SYS_GEN,
+        config.IMG_TAG_USR_GEN.replace("%PROGRAMME%", programme),
+        config.GENERATE_MODEL,
+        disable_json_mode=True,
+    )
+
+    print("(Advanced Mode) Generating image...")
+    # Steer the image model towards a Minecraft look. The suffix previously ended
+    # with an unmatched ")" that was sent to the image model as part of the prompt.
+    tag = image_tag + ", minecraft"
+    image_url = core.ask_dall_e(tag)
+
+    print("(Advanced Mode) Generating schematic...")
+    # NOTE(review): SYS_GEN_ADV refers to the designer's programme, but only the raw
+    # description is substituted here - confirm whether `programme` should be sent.
+    response = core.askgpt(
+        config.SYS_GEN_ADV,
+        config.USR_GEN_ADV.replace("%DESCRIPTION%", description),
+        config.VISION_MODEL,
+        image_url=image_url,
+    )
+
+    return core.text_to_schem(response)
+
 def generate_schematic():
     """
     Generates a schematic file based on user input.
@@ -42,6 +60,11 @@ def generate_schematic():
     """
     generate_button.config(state=tk.DISABLED, text="Generating...")
 
+    if config.ADVANCED_MODE:
+        msgbox.showwarning("Warning", "You are using advanced mode. This mode will generate schematic with higher quality, but it may take longer to generate.")
+
+    msgbox.showinfo("Info", "It is expected to take 30 seconds to 5 minutes. The programme may \"not responding\", this is normal, just be patient. DO NOT CLOSE THE PROGRAM. Click the button below to start generating.")
+
     version = version_entry.get()
     name = name_entry.get()
     description = description_entry.get()
@@ -50,7 +73,10 @@ def generate_schematic():
     logger(f"console: input name {name}")
     logger(f"console: input description {description}")
 
-    schem = get_schematic(description)
+    if config.ADVANCED_MODE:
+        schem = get_schematic_advanced(description)
+    else:
+        schem = get_schematic(description)
 
     logger(f"console: Saving {name}.schem to generated/ folder.")
     version_tag = core.input_version_to_mcs_tag(version)