Update app.py
app.py CHANGED
```diff
@@ -156,40 +156,30 @@ clip_model.to("cuda")
 
 # Tokenizer
 print("Loading tokenizer")
-tokenizer = AutoTokenizer.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH / "text_model", use_fast=True)
 assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
 
 # LLM
 print("Loading LLM")
-
-
-    text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16)
-else:
-    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
-
+print("Loading VLM's custom text model")
+text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16)
 text_model.eval()
 
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
 image_adapter.eval()
 image_adapter.to("cuda")
 
 
 def preprocess_image(input_image: Image.Image) -> torch.Tensor:
-    """
-    Preprocess the input image for the CLIP model.
-    """
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
     return pixel_values.to('cuda')
 
 def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
-    """
-    Generate a caption based on the image features and prompt.
-    """
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
     embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
```
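For context on the unchanged `preprocess_image` body: `pil_to_tensor` returns uint8 values in [0, 255], dividing by 255 rescales to [0, 1], and normalizing with mean 0.5 and std 0.5 maps that range to [-1, 1], i.e. x -> (x - 0.5) / 0.5. A minimal standalone sketch of the same arithmetic (the solid-gray test image is made up):

```python
import torchvision.transforms.functional as TVF
from PIL import Image

# Mid-gray dummy image; (128/255 - 0.5) / 0.5 is roughly 0.0039.
img = Image.new("RGB", (512, 512), color=(128, 128, 128))
pixel_values = TVF.pil_to_tensor(img.resize((384, 384), Image.LANCZOS)).unsqueeze(0) / 255.0
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
print(pixel_values.min().item(), pixel_values.max().item())  # both ~0.0039
```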
```diff
@@ -216,7 +206,7 @@ def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+    return tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
 
 @spaces.GPU()
 @torch.no_grad()
```
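The condition above strips a single trailing stop token (the tokenizer's EOS or the Llama-3-style `<|eot_id|>`) so it never shows up in the decoded caption, since `batch_decode` is called with `skip_special_tokens=False`. A standalone sketch of the same trimming on a dummy tensor (the token ids are placeholders, not taken from the app's tokenizer):

```python
import torch

EOS_ID, EOT_ID = 2, 128009  # placeholder ids; real values come from the tokenizer

generate_ids = torch.tensor([[5, 6, 7, EOT_ID]])
# Drop the final token if it is a stop token before decoding.
if generate_ids[0][-1] in (EOS_ID, EOT_ID):
    generate_ids = generate_ids[:, :-1]
print(generate_ids)  # tensor([[5, 6, 7]])
```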
```diff
@@ -266,18 +256,33 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
     # For debugging
     print(f"Prompt: {prompt_str}")
 
+    # Preprocess image
     pixel_values = preprocess_image(input_image)
 
+    # Embed image
     with torch.amp.autocast_mode.autocast('cuda', enabled=True):
         vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
         image_features = vision_outputs.hidden_states
         embedded_images = image_adapter(image_features)
         embedded_images = embedded_images.to('cuda')
 
-    #
-
-
-
+    # Build the conversation
+    convo = [
+        {
+            "role": "system",
+            "content": "You are a helpful image captioner.",
+        },
+        {
+            "role": "user",
+            "content": prompt_str,
+        },
+    ]
+
+    # Format the conversation
+    convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    assert isinstance(convo_string, str)
+
+    # Generate caption
     caption = generate_caption(text_model, tokenizer, embedded_images, prompt_str)
 
     return prompt_str, caption.strip()
```
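Note that the new code renders `convo_string` but still passes the raw `prompt_str` to `generate_caption`. For reference, `apply_chat_template(..., tokenize=False, add_generation_prompt=True)` turns the message list into the model's prompt text; a sketch below (the model id is a placeholder, and the commented output is only roughly what a Llama-3-style template produces):

```python
from transformers import AutoTokenizer

# Placeholder id: any tokenizer that ships a chat template behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

convo = [
    {"role": "system", "content": "You are a helpful image captioner."},
    {"role": "user", "content": "Write a descriptive caption for this image."},
]
convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
print(convo_string)
# Roughly: <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a
# helpful image captioner.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n
# Write a descriptive caption for this image.<|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>\n\n
```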
```diff
@@ -437,8 +442,8 @@ def login(username, password):
 # Gradio interface
 with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True) as demo:
     with gr.Tab("Welcome"):
-        with gr.Row():
-            with gr.Column(scale=2):
+        with gr.Row(elem_classes="welcome-tab"):
+            with gr.Column(scale=2, elem_classes="welcome-content"):
                 gr.Markdown(
                     """
                     <img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/LVZnwLV43UUvKu3HORqSs.webp" alt="UDG" width="250" class="centered-image">
```
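The `elem_classes` values added here give the welcome layout CSS hooks that the `css` string passed to `gr.Blocks` can target. A sketch of how such rules might look (the selectors' bodies are illustrative; the real `css` variable defined elsewhere in app.py may differ):

```python
import gradio as gr

# Hypothetical rules for the classes added in this commit.
css = """
.welcome-tab { padding: 1rem; }
.welcome-content { max-width: 800px; margin: 0 auto; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Row(elem_classes="welcome-tab"):
        with gr.Column(scale=2, elem_classes="welcome-content"):
            gr.Markdown("Welcome!")
```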
```diff
@@ -471,9 +476,9 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
         )
 
         with gr.Row():
-            username = gr.Textbox(label="Username", placeholder="Enter your username"
+            username = gr.Textbox(label="Username", placeholder="Enter your username")
         with gr.Row():
-            password = gr.Textbox(label="Password", type="password", placeholder="Enter your password"
+            password = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
         with gr.Row():
             login_button = gr.Button("Login", size="sm")
         login_message = gr.Markdown(visible=False)
```
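These fields feed the `login()` handler defined earlier in app.py, which drives the four outputs `[caption_captain_tab, username, password, login_message]` wired up at the end of this diff. A hypothetical handler with that output shape (the credential check is a placeholder, not the app's real logic):

```python
import gradio as gr

def login(username, password):
    # Placeholder check; the real app validates against its own credential store.
    if username == "demo" and password == "demo":
        # Show the captioning tab, clear the form, hide any error message.
        return gr.update(visible=True), gr.update(value=""), gr.update(value=""), gr.update(visible=False)
    return gr.update(visible=False), gr.update(), gr.update(value=""), gr.update(
        value="Invalid username or password", visible=True
    )
```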
```diff
@@ -485,7 +490,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
                     # How to Use Caption Captain
 
                     <img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/Ce_Z478iOXljvpZ_Fr_Y7.png" alt="Captain" width="100" style="max-width: 100%; height: auto;">
-
+
                     Hello, artist! Let's create amazing captions for your pictures. Here's a comprehensive guide:
 
                     1. **Upload Your Image**: Choose a picture you want to caption and upload it.
```
```diff
@@ -553,34 +558,35 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
                     value="long",
                 )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with gr.Accordion("Extra Options", open=True):
+                    extra_options = gr.CheckboxGroup(
+                        choices=[
+                            "If there is a person/character in the image you must refer to them as {name}.",
+                            "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
+                            "Include information about lighting.",
+                            "Include information about camera angle.",
+                            "Include information about whether there is a watermark or not.",
+                            "Include information about whether there are JPEG artifacts or not.",
+                            "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
+                            "Do NOT include anything sexual; keep it PG.",
+                            "Do NOT mention the image's resolution.",
+                            "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
+                            "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
+                            "Do NOT mention any text that is in the image.",
+                            "Specify the depth of field and whether the background is in focus or blurred.",
+                            "If applicable, mention the likely use of artificial or natural lighting sources.",
+                            "Do NOT use any ambiguous language.",
+                            "Include whether the image is sfw, suggestive, or nsfw.",
+                            "ONLY describe the most important elements of the image."
+                        ],
+                        label="Select Extra Options"
+                    )
 
                 name_input = gr.Textbox(label="Person/Character Name (if applicable)")
                 gr.Markdown("**Note:** Name input is only used if an Extra Option is selected that requires it.")
 
                 custom_prompt = gr.Textbox(label="Custom Prompt (optional, will override all other settings)")
-                gr.Markdown("**Note:**
+                gr.Markdown("**Note:** Alpha Two is not a general instruction follower and will not follow prompts outside its training data well. Use this feature with caution.")
 
             with gr.Column():
                 error_message = gr.Markdown(visible=False)
```
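Several of these options embed a literal `{name}` placeholder that the app fills from `name_input` when assembling the final prompt. A hypothetical sketch of that substitution (the helper name and wiring are assumptions, not the app's actual code):

```python
# Hypothetical helper mirroring how selected options could fold into the prompt.
def build_prompt(base_prompt: str, selected_options: list[str], name: str) -> str:
    prompt = base_prompt + " " + " ".join(selected_options)
    return prompt.format(name=name)  # fills the literal {name} placeholder

print(build_prompt(
    "Write a long descriptive caption for this image.",
    ["If there is a person/character in the image you must refer to them as {name}."],
    "Alice",
))
```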
```diff
@@ -646,6 +652,7 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
         outputs=[caption_captain_tab, username, password, login_message]
     )
 
+    # Add this new event listener for the password field
     password.submit(
         login,
         inputs=[username, password],
```