add joy caption2 (#180)

* add joy caption2 * reformat * delete redundant log * update the docs * reformat * update the return * update the docs for more details * reformat --------- Co-authored-by: Yao Chi <[email protected]>
siliconflow · Oct 30, 2024 · 3bb5843 · 3bb5843
1 parent 035258f
commit 3bb5843
Show file tree

Hide file tree

Showing 5 changed files with 203 additions and 0 deletions.
diff --git a/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-api.png b/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-api.png
diff --git a/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-caption_type.png b/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-caption_type.png
diff --git a/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-example.png b/docs/docs/ai-assistants/imgs/siliconcloud-joycaption2-example.png
diff --git a/docs/docs/ai-assistants/introduce.md b/docs/docs/ai-assistants/introduce.md
@@ -37,3 +37,61 @@ For the most current information on available models and pricing, please refer t
 ## ☁️BizyAir Joy Caption
 
 The ☁️BizyAir Joy Caption node is a powerful tool designed to automatically generate descriptive captions for images, thanks to https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha.
+
+## ☁️BizyAir Joy Caption2
+
+The ☁️BizyAir Joy Caption2 node is an upgraded version of the ☁️BizyAir Joy Caption node, thanks to https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two.
+
+### Key Features:
+
+![](./imgs/siliconcloud-joycaption2-api.png)
+
+1. **do_sample**: The do_sample parameter determines whether the model uses a random sampling method to generate the next word, or simply selects the most likely word.
+
+    - `do_sample=True`: The next word is sampled randomly, which increases the variety and creativity of the generated text.
+
+    - `do_sample=False`: The word with the highest probability is always selected, so the generated text is more conservative.
+
+2. **temperature**: The temperature parameter affects the shape of the probability distribution when sampling, and thus the variety of generated text.
+
+    - A higher temperature will make the distribution more uniform and increase randomness.
+
+    - A lower temperature makes the distribution sharper, less random, and more inclined to choose words with higher probability.
+
+3. **max_tokens**: The max_tokens parameter specifies the maximum number of tokens that the model can generate when generating text. The upper limit here is 512.
+
+4. **caption_type**: Each caption_type corresponds to a default system prompt, as listed below.
+
+    ![](./imgs/siliconcloud-joycaption2-caption_type.png)
+
+    - **Descriptive**: Write a descriptive caption for this image in a formal tone.
+
+    - **Descriptive (Informal)**: Write a descriptive caption for this image in a casual tone.
+
+    - **Training Prompt**: Write a stable diffusion prompt for this image.
+
+    - **MidJourney**: Write a MidJourney prompt for this image.
+
+    - **Booru tag list**: Write a list of Booru tags for this image.
+
+    - **Booru-like tag list**: Write a list of Booru-like tags for this image.
+
+    - **Art Critic**: Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.
+
+    - **Product Listing**: Write a caption for this image as though it were a product listing.
+
+    - **Social Media Post**: Write a caption for this image as if it were being used for a social media post.
+
+5. **caption_length**: The caption_length parameter sets the length of the output. If the max_tokens parameter is smaller than the requested length, the output will be truncated.
+
+6. **extra_options**: If you want to add more instructions to the default prompt, you can write them here.
+
+    - For example, if you want the person in the picture to be referred to by a specific name, you can write the following: `If there is a person/character in the image you must refer to them as {name}.`
+
+7. **name_input**: The value of *name_input* replaces the `{name}` placeholder in *extra_options*.
+
+    - For example, if you write `Jack` here and write `If there is a person/character in the image you must refer to them as {name}.` in *extra_options*, the person in the image will be referred to as Jack in the output.
+
+    ![](./imgs/siliconcloud-joycaption2-example.png)
+
+8. **custom_prompt**: If you want to fully customize the prompt, you can write it here. It overrides the default prompt, and the other prompt-related settings (*caption_type*, *extra_options* and *name_input*) will no longer take effect.
diff --git a/llm.py b/llm.py
@@ -272,13 +272,158 @@ def joycaption(self, image, do_sample, temperature, max_tokens):
         return (caption,)
 
 
+class BizyAirJoyCaption2:  # node that requests an image caption from the remote joycaption2 endpoint
+    def __init__(self):
+        pass
+
+    # refer to: https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two
+    API_URL = f"{BIZYAIR_SERVER_ADDRESS}/supernode/joycaption2"  # URL that joycaption2() posts the payload to
+
+    @classmethod
+    def INPUT_TYPES(s):  # "s" receives the class itself (classmethod); returns the input declaration dict
+        return {
+            "required": {
+                "image": ("IMAGE",),
+                "do_sample": ([True, False],),  # two-choice list; compared with True before being sent
+                "temperature": (
+                    "FLOAT",
+                    {
+                        "default": 0.5,
+                        "min": 0.0,
+                        "max": 2.0,
+                        "step": 0.01,
+                        "round": 0.001,
+                        "display": "number",
+                    },
+                ),
+                "max_tokens": (  # forwarded to the server under the key "max_new_tokens"
+                    "INT",
+                    {
+                        "default": 256,
+                        "min": 16,
+                        "max": 512,
+                        "step": 16,
+                        "display": "number",
+                    },
+                ),
+                "caption_type": (
+                    [
+                        "Descriptive",
+                        "Descriptive (Informal)",
+                        "Training Prompt",
+                        "MidJourney",
+                        "Booru tag list",
+                        "Booru-like tag list",
+                        "Art Critic",
+                        "Product Listing",
+                        "Social Media Post",
+                    ],
+                ),
+                "caption_length": (
+                    ["any", "very short", "short", "medium-length", "long", "very long"]
+                    + [str(i) for i in range(20, 261, 10)],  # numeric choices "20", "30", ..., "260"
+                ),
+                "extra_options": (
+                    "STRING",
+                    {
+                        "default": "If there is a person/character in the image you must refer to them as {name}.",
+                        "tooltip": "Extra options for the model",
+                        "multiline": True,
+                    },
+                ),
+                "name_input": (
+                    "STRING",
+                    {
+                        "default": "Jack",
+                        "tooltip": "Name input is only used if an Extra Option is selected that requires it.",
+                    },
+                ),
+                "custom_prompt": (
+                    "STRING",
+                    {
+                        "default": "",
+                        "multiline": True,
+                    },
+                ),
+            }
+        }
+
+    RETURN_TYPES = ("STRING",)  # joycaption2() returns a one-element tuple holding the caption
+    FUNCTION = "joycaption2"  # name of the method defined below
+
+    CATEGORY = "☁️BizyAir/AI Assistants"
+
+    def joycaption2(  # builds the request payload, posts it to API_URL and returns (caption,)
+        self,
+        image,
+        do_sample,
+        temperature,
+        max_tokens,
+        caption_type,
+        caption_length,
+        extra_options,
+        name_input,
+        custom_prompt,
+    ):
+        API_KEY = get_api_key()
+        SIZE_LIMIT = 1536  # largest value accepted for either unpacked image dimension
+        _, w, h, c = image.shape  # NOTE(review): assumes a 4-D (batch, w, h, channels) layout; w/h order may be swapped — TODO confirm
+        assert (  # NOTE: assert statements are skipped when Python runs with -O
+            w <= SIZE_LIMIT and h <= SIZE_LIMIT  # inclusive check: exactly SIZE_LIMIT passes although the message says "less than"
+        ), f"width and height must be less than {SIZE_LIMIT}x{SIZE_LIMIT}, but got {w} and {h}"
+
+        payload = {
+            "image": None,  # filled in below once the image has been encoded
+            "do_sample": do_sample == True,
+            "temperature": temperature,
+            "max_new_tokens": max_tokens,  # key name differs from the node input name
+            "caption_type": caption_type,
+            "caption_length": caption_length,
+            "extra_options": [extra_options],  # sent as a single-element list
+            "name_input": name_input,
+            "custom_prompt": custom_prompt,
+        }
+        auth = f"Bearer {API_KEY}"
+        headers = {
+            "accept": "application/json",
+            "content-type": "application/json",
+            "authorization": auth,
+        }
+        input_image = encode_data(image, disable_image_marker=True)
+        payload["image"] = input_image
+
+        ret: str = send_post_request(self.API_URL, payload=payload, headers=headers)
+        ret = json.loads(ret)  # first decode: the response body itself
+
+        try:
+            if "result" in ret:  # when present, "result" holds a JSON string that is decoded again
+                ret = json.loads(ret["result"])
+        except Exception as e:
+            raise Exception(f"Unexpected response: {ret} {e=}")
+
+        if ret["type"] == "error":  # failure reported in the response; surface its message
+            raise Exception(ret["message"])
+
+        msg = ret["data"]
+        if msg["type"] not in (  # only these payload types are accepted
+            "comfyair",
+            "bizyair",
+        ):
+            raise Exception(f"Unexpected response type: {msg}")
+
+        caption = msg["data"]
+        return (caption,)  # one-element tuple, matching RETURN_TYPES
+
+
 NODE_CLASS_MAPPINGS = {
     "BizyAirSiliconCloudLLMAPI": SiliconCloudLLMAPI,
     "BizyAirSiliconCloudVLMAPI": SiliconCloudVLMAPI,
     "BizyAirJoyCaption": BizyAirJoyCaption,
+    "BizyAirJoyCaption2": BizyAirJoyCaption2,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
     "BizyAirSiliconCloudLLMAPI": "☁️BizyAir SiliconCloud LLM API",
     "BizyAirSiliconCloudVLMAPI": "☁️BizyAir SiliconCloud VLM API",
     "BizyAirJoyCaption": "☁️BizyAir Joy Caption",
+    "BizyAirJoyCaption2": "☁️BizyAir Joy Caption2",
 }