Chroma-Extra

Running on Zero

App Files Files Community

gokaygokay committed on Feb 5

Commit

91840f8

1 Parent(s): 13fa156

understanding

Browse files

Files changed (3) hide show

app.py +137 -109
llm_inference_video.py +12 -1
vlm_captions.py +64 -0

app.py CHANGED Viewed

@@ -17,119 +17,147 @@ def create_video_interface():
     with gr.Blocks(theme='bethecloud/storj_theme') as demo:
         gr.HTML(title)
-        with gr.Row():
-            with gr.Column(scale=1):
-                input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
-                style = gr.Dropdown(
-                    choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
-                            "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
-                    value="Simple",
-                    label="Video Style"
-                )
-                custom_elements = gr.Textbox(label="Custom Technical Elements",
-                                           placeholder="e.g., Infrared hybrid, Datamosh transitions")
-                prompt_length = gr.Dropdown(
-                    choices=["Short", "Medium", "Long"],
-                    value="Medium",
-                    label="Prompt Length"
-                )
-            with gr.Column(scale=1):
-                camera_direction = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Zoom in", "Zoom out", "Pan left", "Pan right",
-                        "Tilt up", "Tilt down", "Orbital rotation",
-                        "Push in", "Pull out", "Track forward", "Track backward",
-                        "Spiral in", "Spiral out", "Arc movement",
-                        "Diagonal traverse", "Vertical rise", "Vertical descent"
-                    ],
-                    value="None",
-                    label="Camera Direction"
-                )
-                camera_style = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
-                        "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
-                        "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
-                        "Time-slice array", "Macro extreme", "Tilt-shift miniature",
-                        "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
-                        "Underwater housing", "Periscope lens"
-                    ],
-                    value="None",
-                    label="Camera Movement Style"
-                )
-                pacing = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
-                        "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
-                        "Gradual build", "Quick cut rhythm", "Long take meditation",
-                        "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
-                        "Parallel action", "Slow motion impact", "Ramping dynamics",
-                        "Montage tempo", "Continuous flow", "Episodic breaks"
-                    ],
-                    value="None",
-                    label="Pacing Rhythm"
-                )
-                special_effects = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Practical effects", "CGI enhancement", "Analog glitches",
-                        "Light painting", "Projection mapping", "Nanosecond exposures",
-                        "Double exposure", "Smoke diffusion", "Lens flare artistry",
-                        "Particle systems", "Holographic overlay", "Chromatic aberration",
-                        "Digital distortion", "Wire removal", "Motion capture",
-                        "Miniature integration", "Weather simulation", "Color grading",
-                        "Mixed media composite", "Neural style transfer"
-                    ],
-                    value="None",
-                    label="SFX Approach"
-                )
-            with gr.Column(scale=1):
-                provider = gr.Dropdown(
-                    choices=["SambaNova", "Groq"],
-                    value="SambaNova",
-                    label="LLM Provider"
-                )
-                model = gr.Dropdown(
-                    choices=[
                         "Meta-Llama-3.1-70B-Instruct",
                         "Meta-Llama-3.1-405B-Instruct",
                         "Meta-Llama-3.1-8B-Instruct"
-                    ],
-                    value="Meta-Llama-3.1-70B-Instruct",
-                    label="Model"
-                )
-                generate_btn = gr.Button("Generate Video Prompt", variant="primary")
-                output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)
-        def update_models(provider):
-            models = {
-                "Groq": ["llama-3.3-70b-versatile"],
-                "SambaNova": [
-                    "Meta-Llama-3.1-70B-Instruct",
-                    "Meta-Llama-3.1-405B-Instruct",
-                    "Meta-Llama-3.1-8B-Instruct"
-                ]
-            }
-            return gr.Dropdown(choices=models[provider], value=models[provider][0])
-        provider.change(update_models, inputs=provider, outputs=model)
-        generate_btn.click(
-            llm_node.generate_video_prompt,
-            inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
-                   custom_elements, provider, model, prompt_length],
-            outputs=output
-        )
     return demo

     with gr.Blocks(theme='bethecloud/storj_theme') as demo:
         gr.HTML(title)
+        with gr.Tab("Video Prompt Generator"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
+                    style = gr.Dropdown(
+                        choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
+                                "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
+                        value="Simple",
+                        label="Video Style"
+                    )
+                    custom_elements = gr.Textbox(label="Custom Technical Elements",
+                                               placeholder="e.g., Infrared hybrid, Datamosh transitions")
+                    prompt_length = gr.Dropdown(
+                        choices=["Short", "Medium", "Long"],
+                        value="Medium",
+                        label="Prompt Length"
+                    )
+                with gr.Column(scale=1):
+                    camera_direction = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Zoom in", "Zoom out", "Pan left", "Pan right",
+                            "Tilt up", "Tilt down", "Orbital rotation",
+                            "Push in", "Pull out", "Track forward", "Track backward",
+                            "Spiral in", "Spiral out", "Arc movement",
+                            "Diagonal traverse", "Vertical rise", "Vertical descent"
+                        ],
+                        value="None",
+                        label="Camera Direction"
+                    )
+                    camera_style = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
+                            "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
+                            "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
+                            "Time-slice array", "Macro extreme", "Tilt-shift miniature",
+                            "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
+                            "Underwater housing", "Periscope lens"
+                        ],
+                        value="None",
+                        label="Camera Movement Style"
+                    )
+                    pacing = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
+                            "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
+                            "Gradual build", "Quick cut rhythm", "Long take meditation",
+                            "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
+                            "Parallel action", "Slow motion impact", "Ramping dynamics",
+                            "Montage tempo", "Continuous flow", "Episodic breaks"
+                        ],
+                        value="None",
+                        label="Pacing Rhythm"
+                    )
+                    special_effects = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Practical effects", "CGI enhancement", "Analog glitches",
+                            "Light painting", "Projection mapping", "Nanosecond exposures",
+                            "Double exposure", "Smoke diffusion", "Lens flare artistry",
+                            "Particle systems", "Holographic overlay", "Chromatic aberration",
+                            "Digital distortion", "Wire removal", "Motion capture",
+                            "Miniature integration", "Weather simulation", "Color grading",
+                            "Mixed media composite", "Neural style transfer"
+                        ],
+                        value="None",
+                        label="SFX Approach"
+                    )
+                with gr.Column(scale=1):
+                    provider = gr.Dropdown(
+                        choices=["SambaNova", "Groq"],
+                        value="SambaNova",
+                        label="LLM Provider"
+                    )
+                    model = gr.Dropdown(
+                        choices=[
+                            "Meta-Llama-3.1-70B-Instruct",
+                            "Meta-Llama-3.1-405B-Instruct",
+                            "Meta-Llama-3.1-8B-Instruct"
+                        ],
+                        value="Meta-Llama-3.1-70B-Instruct",
+                        label="Model"
+                    )
+                    generate_btn = gr.Button("Generate Video Prompt", variant="primary")
+                    output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)
+            def update_models(provider):
+                """Return a model dropdown whose choices match the selected LLM provider."""
+                models = {
+                    "Groq": ["llama-3.3-70b-versatile"],
+                    "SambaNova": [
                         "Meta-Llama-3.1-70B-Instruct",
                         "Meta-Llama-3.1-405B-Instruct",
                         "Meta-Llama-3.1-8B-Instruct"
+                    ]
+                }
+                # The first model listed for the provider becomes the selected value;
+                # a provider missing from the table raises KeyError.
+                return gr.Dropdown(choices=models[provider], value=models[provider][0])
+            provider.change(update_models, inputs=provider, outputs=model)
+            generate_btn.click(
+                llm_node.generate_video_prompt,
+                inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
+                       custom_elements, provider, model, prompt_length],
+                outputs=output
+            )
+        with gr.Tab("Visual Analysis"):
+            with gr.Row():
+                with gr.Column():
+                    image_input = gr.Image(label="Upload Image")
+                    image_question = gr.Textbox(
+                        label="Question (optional)",
+                        placeholder="What is in this image?"
+                    )
+                    analyze_image_btn = gr.Button("Analyze Image")
+                    image_output = gr.Textbox(label="Analysis Result", lines=5)
+                with gr.Column():
+                    video_input = gr.Video(label="Upload Video")
+                    analyze_video_btn = gr.Button("Analyze Video")
+                    video_output = gr.Textbox(label="Video Analysis", lines=10)
+            analyze_image_btn.click(
+                llm_node.analyze_image,
+                inputs=[image_input, image_question],
+                outputs=image_output
+            )
+            analyze_video_btn.click(
+                llm_node.analyze_video,
+                inputs=video_input,
+                outputs=video_output
+            )
     return demo

llm_inference_video.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import random
 from groq import Groq
 from openai import OpenAI
-from gradio_client import Client
 class VideoLLMInferenceNode:
     def __init__(self):
@@ -14,6 +14,17 @@ class VideoLLMInferenceNode:
             api_key=self.sambanova_api_key,
             base_url="https://api.sambanova.ai/v1",
         )
     def generate_video_prompt(
         self,

 import random
 from groq import Groq
 from openai import OpenAI
+from vlm_captions import VLMCaptioning
 class VideoLLMInferenceNode:
     def __init__(self):
             api_key=self.sambanova_api_key,
             base_url="https://api.sambanova.ai/v1",
         )
+        # Initialize VLM captioning
+        self.vlm = VLMCaptioning()
+    def analyze_image(self, image_path, question=None):
+        """Analyze image using MiniCPM-O.
+
+        Args:
+            image_path: The image to describe, forwarded untouched to the
+                VLM captioner.
+            question: Optional question about the image. ``None`` or a
+                blank string means "no question".
+
+        Returns:
+            Whatever ``self.vlm.analyze_image`` returns.
+        """
+        # Forwarding None/"" would replace the captioner's own default prompt
+        # with an empty question, so only pass the argument when it has text.
+        if question is None or not str(question).strip():
+            return self.vlm.analyze_image(image_path)
+        return self.vlm.analyze_image(image_path, question)
+    def analyze_video(self, video_path):
+        """Run the MiniCPM-O captioner over the video at ``video_path``.
+
+        Delegates to ``self.vlm.analyze_video_frames`` and hands back its
+        result unchanged.
+        """
+        captioner = self.vlm
+        frame_descriptions = captioner.analyze_video_frames(video_path)
+        return frame_descriptions
     def generate_video_prompt(
         self,

vlm_captions.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+from decord import VideoReader, cpu
+import spaces
+class VLMCaptioning:
+    """Describe images and sampled video frames with the MiniCPM-o 2.6 model."""
+
+    # Prompt used when the caller supplies no (or a blank) image question.
+    DEFAULT_IMAGE_QUESTION = "Describe this image in detail."
+    # Prompt sent along with every sampled video frame.
+    FRAME_QUESTION = "Describe the main action in this scene."
+
+    def __init__(self):
+        """Load the model in eval mode on the GPU, plus its tokenizer."""
+        print("Loading MiniCPM-O model...")
+        self.model = AutoModel.from_pretrained(
+            'openbmb/MiniCPM-o-2_6',
+            trust_remote_code=True,
+            attn_implementation='sdpa',
+            torch_dtype=torch.bfloat16
+        )
+        self.model = self.model.eval().cuda()
+        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
+
+    @staticmethod
+    def _to_rgb_image(source):
+        """Return ``source`` (PIL image, array, path or file object) as an RGB PIL image."""
+        if isinstance(source, Image.Image):
+            return source.convert('RGB')
+        if hasattr(source, '__array_interface__'):
+            # NOTE(review): Gradio's Image component delivers a NumPy array
+            # unless it is created with type="filepath" -- confirm against the UI.
+            return Image.fromarray(source).convert('RGB')
+        # Image.open is lazy: convert() inside the context manager loads the
+        # pixels before the underlying file handle is closed.
+        with Image.open(source) as opened:
+            return opened.convert('RGB')
+
+    def _describe(self, image, prompt):
+        """Send one image plus one text prompt to the model and return its reply."""
+        msgs = [{'role': 'user', 'content': [image, prompt]}]
+        return self.model.chat(
+            image=None,
+            msgs=msgs,
+            tokenizer=self.tokenizer
+        )
+
+    @spaces.GPU()
+    def analyze_image(self, image_path, question=DEFAULT_IMAGE_QUESTION):
+        """Generate a description for a single image.
+
+        Args:
+            image_path: Path or file object of the image; a PIL image or a
+                NumPy array is accepted as well.
+            question: Prompt for the model. ``None`` or a blank string falls
+                back to ``DEFAULT_IMAGE_QUESTION``.
+
+        Returns:
+            The model's reply, or a string starting with
+            ``"Error analyzing image: "`` when anything fails.
+        """
+        if image_path is None:
+            return "Error analyzing image: no image provided"
+        if question is None or not str(question).strip():
+            question = self.DEFAULT_IMAGE_QUESTION
+        try:
+            image = self._to_rgb_image(image_path)
+            return self._describe(image, question)
+        except Exception as e:
+            # UI boundary: report the failure as text instead of raising.
+            return f"Error analyzing image: {str(e)}"
+
+    @spaces.GPU()
+    def analyze_video_frames(self, video_path, frame_interval=30):
+        """Extract frames from a video at a fixed stride and describe each one.
+
+        Args:
+            video_path: Path of the video file to read.
+            frame_interval: Stride, in frames, between sampled frames.
+                Values below 1 are treated as 1.
+
+        Returns:
+            A list with one description per sampled frame (empty when the
+            video has no frames), or a one-element list holding an
+            ``"Error processing video: ..."`` message when anything fails.
+        """
+        if not video_path:
+            return ["Error processing video: no video provided"]
+        try:
+            # range() rejects a zero step and yields nothing for a negative one.
+            step = max(1, int(frame_interval))
+            # Load video
+            vr = VideoReader(video_path, ctx=cpu(0))
+            # Extract frames at intervals
+            frame_indices = list(range(0, len(vr), step))
+            if not frame_indices:
+                return []
+            frames = vr.get_batch(frame_indices).asnumpy()
+            # Convert each frame to a PIL image and ask the model about it.
+            return [
+                self._describe(Image.fromarray(frame), self.FRAME_QUESTION)
+                for frame in frames
+            ]
+        except Exception as e:
+            # UI boundary: report the failure as text instead of raising.
+            return [f"Error processing video: {str(e)}"]