Spaces:

mgbam
/

my-video-app

Sleeping

App Files Files Community

mgbam committed about 1 month ago

Commit

ebc3520

verified ·

1 Parent(s): 08839d3

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -54

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import time
 import random
 import subprocess
 from pathlib import Path
 import google.generativeai as genai
 from tavily import TavilyClient
@@ -12,18 +13,18 @@ from runwayml import RunwayML, TaskFailedError
 from PIL import Image, ImageDraw, ImageFont
 # =============================================================
-# AI VIDEO STUDIO (Gen-4 Turbo Image→Video compliant rewrite)
 # =============================================================
-# Key changes:
-# 1. Added *required* prompt_image for Gen-4 / gen4_turbo image_to_video tasks (was missing -> error).
-# 2. Added UI input for an optional user keyframe image; if absent we auto-generate a placeholder.
-# 3. Included prompt_text together with prompt_image for better guidance.
-# 4. Added more robust polling / retry & explicit exception surfaces.
-# 5. Added structured logging + deterministic temp directory per job.
-# 6. Wrapped cleanup in finally; kept mock VO approach.
-# 7. Added basic safety guardrails.
-#
-# Gen-4 requires an input image plus text prompt (cannot be pure text alone) – if you want pure text-to-video, switch to Gen-3 Alpha text mode. See docs.
 # =============================================================
 # --- 1. CONFIGURE API KEYS ---
@@ -36,13 +37,15 @@ except KeyError as e:
     raise ValueError(f"API Key Error: Please set the {e} secret in your environment.")
 # --- 2. CONSTANTS / SETTINGS ---
-GEN4_MODEL = "gen4_turbo"   # adjust to "gen4" if you prefer (slower / potentially higher fidelity)
 SCENE_COUNT = 4
-SCENE_DURATION_SECONDS = 5  # Gen-4 supports 5 or 10 seconds
-VIDEO_RATIO = "1280:720"    # 16:9
-WORDS_PER_SEC = 2.5          # Used for mock narration length
-MAX_POLL_SECONDS = 180       # Per scene
 POLL_INTERVAL = 5
 # --- 3. UTILITIES ---
 def _log(msg: str):
@@ -50,20 +53,22 @@ def _log(msg: str):
 def create_placeholder_image(text: str, path: Path, size=(1280, 720)) -> Path:
-    """Create a simple placeholder keyframe if user supplies none.
-    You can later replace this with a real text-to-image generation step."""
     img = Image.new("RGB", size, (10, 10, 10))
     draw = ImageDraw.Draw(img)
     try:
         font = ImageFont.truetype("DejaVuSans-Bold.ttf", 60)
     except Exception:
         font = ImageFont.load_default()
-    wrapped = []
     line = ""
-    for word in text.split():
         test = f"{line} {word}".strip()
-        if len(test) > 28:  # naive wrap
-            wrapped.append(line)
             line = word
         else:
             line = test
@@ -100,12 +105,82 @@ def poll_runway_task(task_obj, max_seconds=MAX_POLL_SECONDS, interval=POLL_INTER
             raise TimeoutError(f"Runway task timed out after {max_seconds}s (status={status})")
         time.sleep(interval)
 # --- 4. CORE PIPELINE ---
 def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress(track_tqdm=True)):
     job_id = f"{int(time.time())}_{random.randint(1000, 9999)}"
     _log(f"Starting job {job_id} :: topic='{topic_prompt}'")
-    # Working directory for this job
     workdir = Path(f"job_{job_id}")
     workdir.mkdir(exist_ok=True)
@@ -121,32 +196,15 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
                 search_depth="basic"
             )
             if research_results and 'results' in research_results:
-                facts = "\n".join([res['content'] for res in research_results['results']])
         except Exception as e:
             _log(f"Tavily failed: {e}")
         # STEP 2: Script
         progress(0.15, desc="✍️ Writing script ...")
-        gemini_model = genai.GenerativeModel('gemini-1.5-flash')
-        script_prompt = f"""
-        You are a creative director for viral short-form videos.
-        Topic: {topic_prompt}
-        Research (may contain noise):\n{facts}\n\n
-        Produce JSON with keys:
-        narration_script: overall narration (concise, energetic, ~85-110 words per 5 scenes). Maintain coherence.
-        scene_prompts: list of {SCENE_COUNT} *visual* prompts. Each should be cinematic, 1-2 sentences, include style / camera / lighting cues and keep characters consistent.
-        Return ONLY JSON.
-        """
-        response = gemini_model.generate_content(script_prompt)
-        try:
-            cleaned = response.text.strip().replace("```json", "").replace("```", "")
-            data = json.loads(cleaned)
-            narration = data['narration_script']
-            scene_prompts = data['scene_prompts']
-            if len(scene_prompts) != SCENE_COUNT:
-                raise ValueError(f"Expected {SCENE_COUNT} scene prompts, got {len(scene_prompts)}")
-        except Exception as e:
-            raise gr.Error(f"Gemini JSON parse error: {e}. Raw: {response.text[:400]}")
         # STEP 3: Mock VO
         progress(0.25, desc="🎙️ Generating mock VO ...")
@@ -154,7 +212,7 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
         generate_mock_voiceover(narration, audio_path)
         intermediates.append(audio_path)
-        # STEP 4: Prepare keyframe image (required for Gen-4 image_to_video)
         progress(0.30, desc="🖼️ Preparing keyframe image ...")
         if keyframe_image is not None:
             keyframe_path = Path(keyframe_image)
@@ -164,15 +222,15 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
         intermediates.append(keyframe_path)
         # STEP 5: Generate scenes
-        clip_paths = []
         for idx, scene_prompt in enumerate(scene_prompts, start=1):
             base_progress = 0.30 + (idx * 0.12)
             progress(min(base_progress, 0.85), desc=f"🎬 Scene {idx}/{len(scene_prompts)} ...")
-            _log(f"Submitting scene {idx}: {scene_prompt[:90]}...")
             try:
                 task = runway_client.image_to_video.create(
                     model=GEN4_MODEL,
-                    prompt_image=str(keyframe_path),  # required param
                     prompt_text=scene_prompt,
                     duration=SCENE_DURATION_SECONDS,
                     ratio=VIDEO_RATIO,
@@ -182,12 +240,12 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
             except TaskFailedError as e:
                 raise gr.Error(f"Runway failed scene {idx}: {getattr(e, 'task_details', 'No details')}")
-            # Download clip
             clip_path = workdir / f"scene_{idx}.mp4"
             r = runway_client._session.get(video_url, stream=True)
             with open(clip_path, 'wb') as f:
                 for chunk in r.iter_content(chunk_size=8192):
-                    if chunk: f.write(chunk)
             clip_paths.append(clip_path)
             intermediates.append(clip_path)
             _log(f"Downloaded scene {idx} -> {clip_path}")
@@ -197,7 +255,8 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
         list_file = workdir / "clips.txt"
         with open(list_file, 'w') as lf:
             for p in clip_paths:
-                lf.write(f"file '{p}'\n")
         intermediates.append(list_file)
         concat_path = workdir / f"concat_{job_id}.mp4"
@@ -221,7 +280,7 @@ def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress
         _log(f"JOB {job_id} FAILED: {e}")
         raise gr.Error(f"An error occurred: {e}")
     finally:
-        # Keep workdir for debugging; comment out next block to remove entire directory
         pass
 # --- 5. GRADIO UI ---
@@ -243,7 +302,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         outputs=video_output
     )
-    gr.Markdown("---\n### Tips\n- Supply a consistent character/style image for more coherent scenes.\n- For pure *text-only* generation, switch to a Gen-3 Alpha text-to-video flow (not implemented here).\n- Replace placeholder keyframe logic with a real T2I model for higher quality.")
 if __name__ == "__main__":
-    demo.launch()

 import random
 import subprocess
 from pathlib import Path
+from typing import List, Any
 import google.generativeai as genai
 from tavily import TavilyClient
 from PIL import Image, ImageDraw, ImageFont
 # =============================================================
+# AI VIDEO STUDIO (Gen-4 Turbo Image→Video) – Robust Version
 # =============================================================
+# Improvements in this revision:
+# - Normalizes narration if model returns list (was causing list.split() AttributeError).
+# - Defensive checks & type coercion for scene_prompts.
+# - Safer JSON extraction (optionally attempts a JSON substring if extra text present).
+# - Fixed accidental newline handling for 'facts'.
+# - Added explicit JSON enforcement hint to Gemini.
+# - Added helper to truncate overly long narration.
+# - Added more granular progress steps & logging.
+# - Added retry for Gemini (transient failures) and for Runway polling.
+# - Added validate_scene_prompts() to guarantee list[str] length == SCENE_COUNT.
 # =============================================================
 # --- 1. CONFIGURE API KEYS ---
     raise ValueError(f"API Key Error: Please set the {e} secret in your environment.")
 # --- 2. CONSTANTS / SETTINGS ---
+GEN4_MODEL = "gen4_turbo"   # adjust to "gen4" for non‑turbo
 SCENE_COUNT = 4
+SCENE_DURATION_SECONDS = 5    # 5 or 10 supported
+VIDEO_RATIO = "1280:720"      # 16:9
+WORDS_PER_SEC = 2.5
+MAX_POLL_SECONDS = 180         # per scene
 POLL_INTERVAL = 5
+GEMINI_MAX_RETRIES = 2
+MAX_NARRATION_WORDS = 520      # safeguard length
 # --- 3. UTILITIES ---
 def _log(msg: str):
 def create_placeholder_image(text: str, path: Path, size=(1280, 720)) -> Path:
+    """Create a simple placeholder keyframe if user supplies none."""
     img = Image.new("RGB", size, (10, 10, 10))
     draw = ImageDraw.Draw(img)
     try:
         font = ImageFont.truetype("DejaVuSans-Bold.ttf", 60)
     except Exception:
         font = ImageFont.load_default()
+    # naive wrap
+    words = text.split()
+    wrapped: List[str] = []
     line = ""
+    for word in words:
         test = f"{line} {word}".strip()
+        if len(test) > 28:
+            if line:
+                wrapped.append(line)
             line = word
         else:
             line = test
             raise TimeoutError(f"Runway task timed out after {max_seconds}s (status={status})")
         time.sleep(interval)
def extract_json_block(text: str) -> str:
    """Isolate a JSON object embedded in a noisy model response.

    Each ``{`` in *text* is tried, left to right, as the start of a JSON
    object; the first span that decodes cleanly is returned. This keeps the
    result parseable even when the surrounding commentary itself contains
    braces (e.g. ``'{"a": 1} hope this helps {smile}'``).

    If no position decodes, the function falls back to the slice from the
    first ``{`` to the last ``}``, and finally to *text* unchanged, so the
    caller's ``json.loads`` reports the real parse error.

    Args:
        text: Raw response text, possibly with prose around the JSON.

    Returns:
        The best candidate JSON substring, or *text* itself when it contains
        no brace-delimited span.
    """
    import json  # local import keeps this helper self-contained

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode tolerates trailing data and reports where the
            # decoded value ends.
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find('{', start + 1)
        else:
            return text[start:end]

    # Nothing decoded: hand back the widest brace-delimited span so the
    # downstream parser produces a meaningful error message.
    first = text.find('{')
    last = text.rfind('}')
    if first != -1 and last > first:
        return text[first:last + 1]
    return text
def coerce_narration(narr: Any, max_words: "int | None" = None) -> str:
    """Normalize a model-supplied narration value into a bounded string.

    Args:
        narr: The ``narration_script`` value as decoded from JSON. May be a
            string, a list of fragments, ``None`` (JSON ``null``), or any
            other value, which is stringified.
        max_words: Maximum number of whitespace-separated words to keep.
            Defaults to the module-level ``MAX_NARRATION_WORDS``.

    Returns:
        The narration with surrounding whitespace removed. When the text
        exceeds *max_words* it is cut to that many words joined by single
        spaces; otherwise internal spacing is left as supplied.
    """
    if max_words is None:
        max_words = MAX_NARRATION_WORDS

    # A JSON null must not turn into the spoken word "None".
    if narr is None:
        return ""

    if isinstance(narr, (list, tuple)):
        narr = ' '.join(str(part) for part in narr if part is not None)
    elif not isinstance(narr, str):
        narr = str(narr)

    words = narr.split()
    if len(words) > max_words:
        narr = ' '.join(words[:max_words])
    return narr.strip()
def validate_scene_prompts(sp: Any, count: "int | None" = None) -> List[str]:
    """Coerce model-supplied scene prompts into exactly *count* strings.

    A scalar is treated as a one-element list and one level of nested lists
    is flattened. Every entry is stringified and stripped; ``None`` and
    blank entries are discarded. A short list is padded by repeating its
    last prompt, and a long list is truncated.

    Args:
        sp: The ``scene_prompts`` value as decoded from JSON.
        count: Number of prompts to return. Defaults to the module-level
            ``SCENE_COUNT``.

    Returns:
        A list of *count* non-empty prompt strings.

    Raises:
        ValueError: If *sp* contains no usable prompt, so there is nothing
            to pad from.
    """
    if count is None:
        count = SCENE_COUNT

    if sp is None:
        sp = []
    elif not isinstance(sp, list):
        sp = [sp]

    flat: List[str] = []
    for item in sp:
        members = item if isinstance(item, list) else [item]
        for member in members:
            if member is None:
                continue
            prompt = str(member).strip()
            if prompt:
                flat.append(prompt)

    # Padding repeats the last prompt, so at least one must exist; fail with
    # a clear message instead of an IndexError on flat[-1].
    if not flat:
        raise ValueError("scene_prompts contained no usable prompt text")

    if len(flat) < count:
        flat.extend([flat[-1]] * (count - len(flat)))
    return flat[:count]
def call_gemini_script(topic: str, facts: str) -> tuple[str, List[str]]:
    """Ask Gemini for a narration script and scene prompts for *topic*.

    The response text is stripped of markdown fences, reduced to its JSON
    object via ``extract_json_block``, parsed, and normalized through
    ``coerce_narration`` and ``validate_scene_prompts``. Any failure in that
    chain (API error, unparsable text, unusable fields) triggers another
    attempt, up to ``GEMINI_MAX_RETRIES`` attempts in total, with a growing
    pause between attempts.

    Args:
        topic: The user-supplied video topic.
        facts: Research text interpolated into the prompt; may be empty.

    Returns:
        A ``(narration, scene_prompts)`` tuple.

    Raises:
        ValueError: If every attempt fails; the last underlying exception is
            attached as ``__cause__``.
    """
    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
    script_prompt = f"""
    You are a creative director for viral short-form educational videos.
    Topic: {topic}
    Research (may contain noise):
{facts}
    STRICT JSON OUTPUT ONLY. Do not add commentary or markdown fences.
    Schema: {{"narration_script": string, "scene_prompts": list[{SCENE_COUNT}]}}
    narration_script rules: energetic, cohesive, <= {MAX_NARRATION_WORDS} words total, no scene numbers.
    scene_prompts: exactly {SCENE_COUNT} cinematic visual descriptions (1-2 sentences each) including style, camera, lighting.
    Return JSON ONLY.
    """
    last_error = None
    for attempt in range(GEMINI_MAX_RETRIES):
        try:
            response = gemini_model.generate_content(script_prompt)
            raw = response.text.strip()
            # The model may still wrap its answer in fences despite the prompt.
            raw = raw.replace('```json', '').replace('```', '').strip()
            raw = extract_json_block(raw)
            data = json.loads(raw)
            if not isinstance(data, dict):
                # A bare list/string would otherwise fail later on .get().
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
            narration = coerce_narration(data.get('narration_script', ''))
            scene_prompts = validate_scene_prompts(data.get('scene_prompts', []))
            return narration, scene_prompts
        except Exception as e:
            last_error = e
            # Back off only when another attempt will follow; sleeping
            # after the final failure just delays the error.
            if attempt < GEMINI_MAX_RETRIES - 1:
                time.sleep(1 + attempt)
    raise ValueError(
        f"Gemini JSON parse failed after {GEMINI_MAX_RETRIES} attempts: {last_error}"
    ) from last_error
 # --- 4. CORE PIPELINE ---
 def generate_video_from_topic(topic_prompt, keyframe_image, progress=gr.Progress(track_tqdm=True)):
     job_id = f"{int(time.time())}_{random.randint(1000, 9999)}"
     _log(f"Starting job {job_id} :: topic='{topic_prompt}'")
     workdir = Path(f"job_{job_id}")
     workdir.mkdir(exist_ok=True)
                 search_depth="basic"
             )
             if research_results and 'results' in research_results:
+                facts = "\n".join(res.get('content', '') for res in research_results['results'])
         except Exception as e:
             _log(f"Tavily failed: {e}")
         # STEP 2: Script
         progress(0.15, desc="✍️ Writing script ...")
+        narration, scene_prompts = call_gemini_script(topic_prompt, facts)
+        _log(f"Narration words: {len(narration.split())}; scenes: {len(scene_prompts)}")
         # STEP 3: Mock VO
         progress(0.25, desc="🎙️ Generating mock VO ...")
         generate_mock_voiceover(narration, audio_path)
         intermediates.append(audio_path)
+        # STEP 4: Keyframe image (required for Gen-4 image_to_video)
         progress(0.30, desc="🖼️ Preparing keyframe image ...")
         if keyframe_image is not None:
             keyframe_path = Path(keyframe_image)
         intermediates.append(keyframe_path)
         # STEP 5: Generate scenes
+        clip_paths: List[Path] = []
         for idx, scene_prompt in enumerate(scene_prompts, start=1):
             base_progress = 0.30 + (idx * 0.12)
             progress(min(base_progress, 0.85), desc=f"🎬 Scene {idx}/{len(scene_prompts)} ...")
+            _log(f"Submitting scene {idx}: {scene_prompt[:100]} ...")
             try:
                 task = runway_client.image_to_video.create(
                     model=GEN4_MODEL,
+                    prompt_image=str(keyframe_path),  # required
                     prompt_text=scene_prompt,
                     duration=SCENE_DURATION_SECONDS,
                     ratio=VIDEO_RATIO,
             except TaskFailedError as e:
                 raise gr.Error(f"Runway failed scene {idx}: {getattr(e, 'task_details', 'No details')}")
             clip_path = workdir / f"scene_{idx}.mp4"
             r = runway_client._session.get(video_url, stream=True)
             with open(clip_path, 'wb') as f:
                 for chunk in r.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
             clip_paths.append(clip_path)
             intermediates.append(clip_path)
             _log(f"Downloaded scene {idx} -> {clip_path}")
         list_file = workdir / "clips.txt"
         with open(list_file, 'w') as lf:
             for p in clip_paths:
+                lf.write(f"file '{p}'\n")
         intermediates.append(list_file)
         concat_path = workdir / f"concat_{job_id}.mp4"
         _log(f"JOB {job_id} FAILED: {e}")
         raise gr.Error(f"An error occurred: {e}")
     finally:
+        # Keep workdir for debugging; remove manually when satisfied.
         pass
 # --- 5. GRADIO UI ---
         outputs=video_output
     )
+    gr.Markdown("---\n### Tips\n- Supply a consistent character/style image for more coherent scenes.\n- Gen-4 requires an input image + (optional) text prompt; pure text alone is not supported in this flow.\n- For pure text-to-video consider a Gen-3 text model.\n- Replace placeholder keyframe logic with a real T2I model for higher quality.")
 if __name__ == "__main__":
+    demo.launch()