Spaces:

matjarm
/

model-comparison

Sleeping

App Files Files Community

matjarm committed on Dec 6, 2024

Commit

9d46b1d

1 Parent(s): b90a4c8

test

Browse files

Files changed (1) hide show

app.py +95 -195

app.py CHANGED Viewed

@@ -1,207 +1,107 @@
 import os
-import random
-import uuid
 import gradio as gr
-import numpy as np
 from PIL import Image
 import torch
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from typing import Tuple
-# CSS for Gradio Interface
-css = '''
-.gradio-container{max-width: 575px !important}
-h1{text-align:center}
-footer {
-    visibility: hidden
-}
-'''
-DESCRIPTION = """
-## Text-to-Image Generator 🚀
-Create stunning images from text prompts using Stable Diffusion XL. Explore high-quality styles and customizable options.
-"""
-# Example Prompts
-examples = [
-    "A beautiful sunset over the ocean, ultra-realistic, high resolution",
-    "A futuristic cityscape with flying cars, cyberpunk theme, vibrant colors",
-    "A cozy cabin in the woods during winter, detailed and realistic",
-    "A magical forest with glowing plants and creatures, fantasy art",
-]
-# Model Configurations
-MODEL_OPTIONS = {
-    "LIGHTNING V5.0": "SG161222/RealVisXL_V5.0_Lightning",
-    "LIGHTNING V4.0": "SG161222/RealVisXL_V4.0_Lightning",
-}
-# Define Styles
-style_list = [
-    {
-        "name": "Ultra HD",
-        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "4K Realistic",
-        "prompt": "realistic 4K image of {prompt}. sharp, detailed, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, blurry, low resolution",
-    },
-    {
-        "name": "Minimal Style",
-        "prompt": "{prompt}, clean, minimalistic",
-        "negative_prompt": "",
-    },
-]
-styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
-DEFAULT_STYLE_NAME = "Ultra HD"
-# Define Global Variables
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-MAX_IMAGE_SIZE = 4096
-MAX_SEED = np.iinfo(np.int32).max
-# Load Model Function
-def load_and_prepare_model(model_id):
-    pipe = StableDiffusionXLPipeline.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    ).to(device)
-    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
-    return pipe
 # Load Models
-models = {key: load_and_prepare_model(value) for key, value in MODEL_OPTIONS.items()}
-# Generate Function
-def generate_image(
-    model_choice: str,
-    prompt: str,
-    negative_prompt: str,
-    style_name: str,
-    width: int,
-    height: int,
-    guidance_scale: float,
-    num_steps: int,
-    num_images: int,
-    randomize_seed: bool,
-    seed: int,
-):
-    # Apply Style
-    positive_style, negative_style = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
-    styled_prompt = positive_style.replace("{prompt}", prompt)
-    styled_negative_prompt = negative_style + (negative_prompt if negative_prompt else "")
-    # Randomize Seed if Enabled
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator(device=device).manual_seed(seed)
-    # Generate Images
-    pipe = models[model_choice]
-    images = pipe(
-        prompt=[styled_prompt] * num_images,
-        negative_prompt=[styled_negative_prompt] * num_images,
-        width=width,
-        height=height,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_steps,
-        generator=generator,
-        output_type="pil",
-    ).images
-    # Save and Return Images
-    image_paths = []
-    for img in images:
-        unique_name = f"{uuid.uuid4()}.png"
-        img.save(unique_name)
-        image_paths.append(unique_name)
-    return image_paths, seed
-# Gradio Interface
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-    with gr.Row():
-        model_choice = gr.Dropdown(
-            label="Select Model",
-            choices=list(MODEL_OPTIONS.keys()),
-            value="LIGHTNING V5.0",
-        )
-    prompt = gr.Textbox(
-        label="Prompt",
-        placeholder="Enter your creative prompt here...",
-    )
-    negative_prompt = gr.Textbox(
-        label="Negative Prompt",
-        placeholder="Optional: Add details you want to avoid...",
-        value="blurry, deformed, low-quality, cartoonish",
-    )
-    style_name = gr.Radio(
-        label="Style",
-        choices=list(styles.keys()),
-        value=DEFAULT_STYLE_NAME,
-    )
-    with gr.Accordion("Advanced Options", open=False):
-        width = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
-        height = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
-        guidance_scale = gr.Slider(
-            label="Guidance Scale",
-            minimum=1,
-            maximum=20,
-            step=0.5,
-            value=7.5,
-        )
-        num_steps = gr.Slider(
-            label="Steps",
-            minimum=1,
-            maximum=50,
-            step=1,
-            value=25,
-        )
-        num_images = gr.Slider(
-            label="Number of Images",
-            minimum=1,
-            maximum=5,
-            step=1,
-            value=1,
-        )
-        randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
-        seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
-    with gr.Row():
-        run_button = gr.Button("Generate Images")
-        result_gallery = gr.Gallery(label="Generated Images", show_label=False)
-    run_button.click(
-        generate_image,
-        inputs=[
-            model_choice,
-            prompt,
-            negative_prompt,
-            style_name,
-            width,
-            height,
-            guidance_scale,
-            num_steps,
-            num_images,
-            randomize_seed,
-            seed,
-        ],
-        outputs=[result_gallery, seed],
-    )
-    gr.Examples(
-        examples=examples,
-        inputs=prompt,
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch()

+import cv2
 import os
 import gradio as gr
+import requests
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
 import torch
+import uuid
 # Load Models
+# Everything below is loaded once, at import time, and shared by all requests.
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Model 1: ViT-GPT2 -- encoder-decoder model, image processor and tokenizer,
+# all loaded from the same "nlpconnect/vit-gpt2-image-captioning" checkpoint.
+model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
+feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+# Model 2: FuseCap -- BLIP processor and conditional-generation model loaded
+# from the "noamrot/FuseCap" checkpoint.
+processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)
+# Model 3: BLIP Large -- same processor/model classes as Model 2, loaded from
+# the "Salesforce/blip-image-captioning-large" checkpoint.
+processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+# Frame Extraction and Captioning Logic
+def process_video(video_path, frame_stride=20):
+    """Caption sampled frames of a video with all three captioning models.
+
+    Args:
+        video_path: Path of the video file to open with OpenCV.
+        frame_stride: Caption every ``frame_stride``-th frame, starting with
+            frame 0. Defaults to 20; must be a positive integer.
+
+    Returns:
+        A dict mapping "Model 1", "Model 2" and "Model 3" to the list of
+        captions that model produced, one entry per sampled frame in frame
+        order. The lists are empty when no frame could be read.
+
+    Raises:
+        ValueError: If ``frame_stride`` is smaller than 1.
+    """
+    if frame_stride < 1:
+        raise ValueError("frame_stride must be a positive integer")
+
+    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}
+    capture = cv2.VideoCapture(video_path)
+    try:
+        frame_index = 0
+        while True:
+            success, frame = capture.read()
+            if not success:
+                # No further frame could be read from the capture.
+                break
+            if frame_index % frame_stride == 0:
+                # Frames are converted from BGR to RGB before being wrapped
+                # in the PIL image that the processors receive.
+                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                # Model 1: ViT-GPT2
+                pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
+                output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
+                caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
+                frame_captions["Model 1"].append(caption1)
+                # Model 2: FuseCap (conditioned on a fixed text prefix)
+                inputs2 = processor2(image, "a picture of ", return_tensors="pt").to(device)
+                out2 = model2.generate(**inputs2, num_beams=3)
+                caption2 = processor2.decode(out2[0], skip_special_tokens=True)
+                frame_captions["Model 2"].append(caption2)
+                # Model 3: BLIP Large (unconditional captioning)
+                inputs3 = processor3(image, return_tensors="pt").to(device)
+                out3 = model3.generate(**inputs3)
+                caption3 = processor3.decode(out3[0], skip_special_tokens=True)
+                frame_captions["Model 3"].append(caption3)
+            frame_index += 1
+    finally:
+        # Release the capture even if one of the models raises mid-video.
+        capture.release()
+    return frame_captions
+# Gradio Interface
+def generate_captions(video):
+    """Caption an uploaded video and format the result for display.
+
+    Args:
+        video: The value delivered by the ``gr.Video`` input. Gradio passes
+            the upload as a file path; a binary file-like object with a
+            ``read()`` method is also accepted and is spooled to a temporary
+            ``.mp4`` file first. ``None`` means nothing was uploaded.
+
+    Returns:
+        A string with one ``### <model name>`` section per model, followed by
+        one ``- <caption>`` bullet per sampled frame, or a short notice when
+        no video was provided.
+    """
+    if video is None:
+        return "No video provided. Please upload a video first."
+
+    temp_path = None
+    try:
+        if hasattr(video, "read"):
+            # File-like input: write it to disk so it can be opened by path.
+            temp_path = f"temp_{uuid.uuid4()}.mp4"
+            with open(temp_path, "wb") as f:
+                f.write(video.read())
+            video_path = temp_path
+        else:
+            # Path input: use the uploaded file in place, no copy needed.
+            video_path = os.fspath(video)
+        captions = process_video(video_path)
+    finally:
+        # Only remove a file this function created, and do so even when
+        # captioning fails, so temporary files do not accumulate.
+        if temp_path is not None and os.path.exists(temp_path):
+            os.remove(temp_path)
+
+    # Format output for display
+    sections = []
+    for model_name, model_captions in captions.items():
+        bullets = "\n".join(f"- {caption}" for caption in model_captions)
+        sections.append(f"### {model_name}\n{bullets}\n\n")
+    return "".join(sections)
+# Gradio UI
+with gr.Blocks() as demo:
+    # Page title and a one-line usage hint.
+    gr.Markdown("# Video Captioning with Multiple Models 🎥")
+    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
+    # Input video, output text area, and the button that triggers captioning.
+    video_input = gr.Video(label="Upload Video")
+    output = gr.Textbox(label="Generated Captions", lines=20)
+    submit_button = gr.Button("Generate Captions")
+    # On click, run generate_captions on the video and show its return value
+    # in the textbox.
+    submit_button.click(
+        fn=generate_captions,
+        inputs=video_input,
+        outputs=output,
     )
+# Start the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
+    demo.launch()