Spaces:

mgbam
/

my-video-app

Sleeping

App Files Files Community

mgbam committed on Jul 20

Commit

3c12225

verified ·

1 Parent(s): 702fd23

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -125

app.py CHANGED Viewed

@@ -1,14 +1,25 @@
 """
 AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
 Features:
-- Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration).
-- Structured script & scene prompt generation with schema enforcement.
-- Multi-keyframe support (user can upload multiple images; otherwise placeholder).
-- Aspect ratio validation & optional auto-crop to closest supported ratio.
-- ElevenLabs voice pagination, retry & diagnostics; streaming or batch TTS.
-- Runway Generative Audio fallback if ElevenLabs fails or no voices.
-- Automatic per-clip sharpness heuristic & re-generation (one retry) for low-detail clips.
-- Prompt enhancer injecting consistent global style; per-scene Subject|Action|Camera|Lighting|Mood|Style template.
 """
 import os
@@ -18,24 +29,24 @@ import random
 import logging
 import subprocess
 import base64
-import math
 from pathlib import Path
-from typing import List, Dict, Any, Optional, Tuple
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import numpy as np
 import google.generativeai as genai
 from tavily import TavilyClient
 from runwayml import RunwayML
 import httpx
-# ---- ElevenLabs (version-agnostic error import) ----
 try:
     from elevenlabs import ElevenLabs
     try:
-        from elevenlabs.errors import ApiError  # may vary by version
     except Exception:
         ApiError = Exception
 except ImportError:
@@ -55,15 +66,14 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
 ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
-RUNWAY_AUDIO_FALLBACK = True  # toggle fallback usage
-missing = [k for k, v in {
     "GEMINI_API_KEY": GEMINI_API_KEY,
     "TAVILY_API_KEY": TAVILY_API_KEY,
     "RUNWAY_API_KEY": RUNWAY_KEY
 }.items() if not v]
-if missing:
-    raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
 genai.configure(api_key=GEMINI_API_KEY)
 tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
@@ -73,8 +83,8 @@ eleven_client = ElevenLabs(api_key=ELEVEN_KEY) if (ELEVEN_KEY and ElevenLabs) el
 # ---------------- Constants ----------------
 DEFAULT_SCENES = 4
 MAX_SCENES = 8
-ALLOWED_DURATIONS = {5, 10}
-SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"}
 WORDS_PER_SEC = 2.5
 PLACEHOLDER_BG = (16, 18, 24)
 PLACEHOLDER_FG = (240, 242, 248)
@@ -82,8 +92,13 @@ FONT_CANDIDATES = [
     "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
     "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
 ]
-SHARPNESS_MIN = 0.015  # empirical edge density threshold
 RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
 # ---------------- Utility ----------------
 def uid() -> str:
@@ -93,7 +108,7 @@ def sanitize_filename(name: str) -> str:
     safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
     return safe or "video"
-def load_font(size: int = 42):
     for p in FONT_CANDIDATES:
         if Path(p).exists():
             try:
@@ -116,12 +131,13 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
         else:
             line.append(w)
     if line: lines.append(" ".join(line))
-    total_h = 0
-    metrics=[]
     for ln in lines:
         bbox = draw.textbbox((0,0), ln, font=font)
         h=bbox[3]-bbox[1]
-        metrics.append((ln,h,bbox)); total_h += h+12
     y=(height-total_h)//2
     for ln,h,bbox in metrics:
         w=bbox[2]-bbox[0]
@@ -132,36 +148,28 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
     img.save(out)
     return out
def aspect_ratio_of(img: Image.Image) -> str:
    """Format the image's pixel size as a ``"width:height"`` string."""
    width, height = img.size
    return f"{width}:{height}"
 def closest_supported_ratio(w: int, h: int) -> str:
     """Return the entry of ``SUPPORTED_RATIOS`` nearest to the ``w`` x ``h`` frame.

     Every supported ratio is a ``"W:H"`` string. Candidates are ranked by the
     absolute difference between their W/H quotient and ``w / h``; equal
     distances fall back to comparing the ratio strings, which keeps the
     choice independent of set iteration order.

     Args:
         w: Frame width in pixels; must be positive.
         h: Frame height in pixels; must be positive.

     Returns:
         The selected ``"W:H"`` string.

     Raises:
         ValueError: If either dimension is not positive. A zero height used
             to escape as an unexplained ``ZeroDivisionError``.
     """
     if w <= 0 or h <= 0:
         raise ValueError(f"Image dimensions must be positive, got {w}x{h}")
     cur_ratio = w / h
     scored = []
     for r in SUPPORTED_RATIOS:
         rw, rh = map(int, r.split(":"))
         scored.append((abs(rw / rh - cur_ratio), r))
     # min() over (distance, ratio) tuples matches the previous sort-then-first.
     return min(scored)[1]
 def crop_to_ratio(img: "Image.Image", ratio: str) -> "Image.Image":
     """Center-crop ``img`` so its aspect ratio matches a ``"W:H"`` string.

     The image is returned untouched when its ratio is already within 1e-3 of
     the target. Otherwise the longer axis is trimmed symmetrically; the image
     is never resized.

     Args:
         img: Object exposing ``size`` as ``(width, height)`` and ``crop(box)``.
         ratio: Target aspect ratio written as ``"W:H"`` with positive integers.

     Returns:
         ``img`` itself when no crop is needed, otherwise ``img.crop(...)``.

     Raises:
         ValueError: If the ratio terms or the image dimensions are not
             positive (these cases used to raise ``ZeroDivisionError``).
     """
     rw, rh = map(int, ratio.split(":"))
     w, h = img.size
     if rw <= 0 or rh <= 0:
         raise ValueError(f"Aspect ratio terms must be positive, got {ratio!r}")
     if w <= 0 or h <= 0:
         raise ValueError(f"Image dimensions must be positive, got {w}x{h}")
     if abs(w / h - rw / rh) < 1e-3:
         return img
     # Integer cross-multiplication avoids float truncation surprises and the
     # max(1, ...) clamp stops sliver images from collapsing to a 0-pixel side.
     if w * rh > h * rw:
         # too wide: trim left and right
         new_w = max(1, (h * rw) // rh)
         x0 = (w - new_w) // 2
         return img.crop((x0, 0, x0 + new_w, h))
     # too tall: trim top and bottom
     new_h = max(1, (w * rh) // rw)
     y0 = (h - new_h) // 2
     return img.crop((0, y0, w, y0 + new_h))
@@ -183,6 +191,9 @@ def research_topic(topic: str) -> str:
 # ---------------- Gemini Script Generation ----------------
 def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
     prompt = f"""
 You are a creative director.
@@ -202,19 +213,19 @@ Return STRICT JSON:
       "lighting": "...",
       "mood": "...",
       "style": "...",
-      "prompt": "<final merged scene prompt (<=40 words)>"
     }}
-    (exactly {scene_count} objects total)
   ]
 }}
 Rules:
-- subject/action focus on continuity of main subject.
-- camera gives ONE motion (e.g. "slow dolly in", "handheld pan", "aerial sweep").
-- lighting (e.g. "golden hour rim light", "soft volumetric interior").
-- mood (emotion / tone).
-- style (cinematic descriptors, film grain, color palette words).
-- prompt MUST integrate all fields succinctly; no numbering; no markdown.
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
@@ -240,7 +251,7 @@ Rules:
     norm=[]
     for sc in scenes:
         if not isinstance(sc,dict): continue
-        prompt_txt = sc.get("prompt") or "Cinematic establishing shot"
         norm.append({
             "subject": sc.get("subject",""),
             "action": sc.get("action",""),
@@ -255,7 +266,7 @@ Rules:
             "subject":"main subject",
             "action":"subtle motion",
             "camera":"slow dolly in",
-            "lighting":"soft directional light",
             "mood":"cinematic",
             "style":"filmic grain",
             "prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
@@ -263,8 +274,8 @@ Rules:
     norm=norm[:scene_count]
     return {"narration": narration, "scenes": norm}
-# ---------------- ElevenLabs ----------------
-def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.6) -> List[Dict[str,str]]:
     if not eleven_client:
         return []
     voices=[]
@@ -282,16 +293,20 @@ def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.6) -> List[Dict[st
         if not token:
             break
         time.sleep(delay)
     return voices
 def tts_elevenlabs(text: str, voice_id: str, model_id: str,
                    stability: float, similarity: float,
                    style: float, speaker_boost: bool,
                    streaming: bool, out_path: str) -> bool:
-    if not eleven_client or not voice_id:
         return False
     try:
-        # clamp
         stability=max(0,min(1,stability))
         similarity=max(0,min(1,similarity))
         style=max(0,min(1,style))
@@ -320,6 +335,10 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
             )
             with open(out_path,"wb") as f:
                 f.write(audio)
         return True
     except ApiError as e:
         log.error(f"ElevenLabs ApiError: {e}")
@@ -327,17 +346,11 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
         log.error(f"ElevenLabs TTS error: {e}")
     return False
-# ---------------- Runway Audio Fallback ----------------
-def runway_generate_audio(text: str, out_path: str) -> bool:
-    """
-    Simple fallback using Runway Generative Audio (pseudo-endpoint placeholder).
-    NOTE: Replace with official SDK call if/when available in your Python client.
-    """
     if not RUNWAY_AUDIO_FALLBACK:
         return False
     try:
-        # Placeholder logic: here we just synthesize silence to keep pipeline moving.
-        # (Integrate actual Runway audio generation when SDK exposes it.)
         duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
         subprocess.run([
             "ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
@@ -349,7 +362,6 @@ def runway_generate_audio(text: str, out_path: str) -> bool:
         log.error(f"Runway audio fallback failed: {e}")
         return False
-# ---------------- Mock / Silent Fallback ----------------
 def silent_track(narration: str, out_path: str):
     duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
     subprocess.run([
@@ -368,7 +380,7 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
             prompt_text=text_prompt,
             duration=duration,
             ratio=ratio
-        )
     except Exception as e:
         raise gr.Error(f"Runway task creation failed: {e}")
@@ -397,34 +409,19 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
 # ---------------- Sharpness Heuristic ----------------
 def clip_edge_density(path: str) -> float:
     """Estimate how sharp a clip is, as a mean edge density.

     OpenCV is tried first (Canny edges averaged over up to 10 frames). If it
     is unavailable, fails, or decodes no frames, a single frame is extracted
     with ffmpeg and measured with PIL's ``FIND_EDGES`` filter.

     Args:
         path: Path of the video file to inspect.

     Returns:
         The measured density, or ``1.0`` when nothing could be measured so
         that an unmeasurable clip is treated as acceptable rather than blurry.
     """
     density = _edge_density_cv2(path)
     if density is None:
         density = _edge_density_ffmpeg(path)
     return 1.0 if density is None else density


 def _edge_density_cv2(path: str) -> Optional[float]:
     """Mean Canny edge density via OpenCV, or None when OpenCV cannot measure."""
     try:
         import cv2  # optional dependency; absence just selects the fallback
         cap = cv2.VideoCapture(path)
     except Exception:
         return None
     try:
         if not cap.isOpened():
             return 1.0
         frames = 0
         acc = 0.0
         while frames < 10:
             ret, frame = cap.read()
             if not ret:
                 break
             gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
             edges = cv2.Canny(gray, 100, 200)
             acc += edges.mean() / 255.0
             frames += 1
         # Zero decoded frames used to yield 0.0, which falsely flagged the
         # clip as blurry; report "no measurement" instead.
         return acc / frames if frames else None
     except Exception:
         return None
     finally:
         # Release the capture on every path, including errors.
         cap.release()


 def _edge_density_ffmpeg(path: str) -> Optional[float]:
     """Edge density of one ffmpeg-extracted frame, or None on any failure."""
     tmp = None
     try:
         tmp = f"frame_{uid()}.png"
         subprocess.run(["ffmpeg","-i",path,"-vf","scale=320:-1","-vframes","1",tmp,"-y"],
                        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
         # Close the file handle before the temp file is deleted.
         with Image.open(tmp) as frame:
             arr = np.array(frame.convert("L").filter(ImageFilter.FIND_EDGES))
         return arr.mean() / 255.0
     except Exception:
         return None
     finally:
         # Always remove the extracted frame, even when analysis failed.
         if tmp is not None:
             try:
                 os.remove(tmp)
             except OSError:
                 pass
 # ---------------- Concatenate & Mux ----------------
 def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
@@ -445,18 +442,18 @@ def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
         try: os.remove(p)
         except OSError: pass
-# ---------------- Global Style ----------------
-GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle motion, high detail"
 def build_scene_prompt(sc: Dict[str,str]) -> str:
     """Build the final text prompt for one scene and append ``GLOBAL_STYLE``.

     The scene's own ``"prompt"`` is preferred. Only when it is missing or
     blank is a prompt assembled from the ``subject``, ``action``, ``camera``,
     ``lighting``, ``mood`` and ``style`` fields, so a scene that carries a
     merged prompt but lacks some schema keys no longer raises ``KeyError``.

     Args:
         sc: Scene dictionary; every key is optional.

     Returns:
         ``"<scene prompt>. <GLOBAL_STYLE>"``, or ``GLOBAL_STYLE`` alone when
         the scene carries no usable text.
     """
     def field(key: str) -> str:
         return str(sc.get(key) or "").strip()

     merged = field("prompt")
     if not merged:
         lead = " ".join(p for p in (field("subject"), field("action")) if p)
         rest = [field(k) for k in ("camera", "lighting", "mood", "style")]
         # Drop empty fields so the prompt never contains ", , ," runs.
         merged = ", ".join(p for p in [lead, *rest] if p)
     # Avoid a doubled period when the scene text already ends with one.
     merged = merged.rstrip(". ")
     return f"{merged}. {GLOBAL_STYLE}" if merged else GLOBAL_STYLE
 # ---------------- Main Pipeline ----------------
 def generate_video(
     topic: str,
-    keyframes: list,          # list of file paths
     scene_count: int,
     clip_duration: int,
     ratio: str,
@@ -479,8 +476,7 @@ def generate_video(
         scene_count = max(1,min(MAX_SCENES,scene_count))
         if clip_duration not in ALLOWED_DURATIONS:
             clip_duration=5
-        # choose model
-        runway_model = "gen4" if quality_mode else "gen4_turbo"
         progress(0.05, desc="🔍 Researching...")
         facts = research_topic(topic)
@@ -494,9 +490,12 @@ def generate_video(
         audio_path=f"narration_{job}.mp3"
         temp_files.append(audio_path)
-        voice_id=""
         if voice_choice and "|" in voice_choice:
-            voice_id = voice_choice.split("|",1)[1]
         tts_ok=False
         if ELEVEN_KEY and voice_id:
@@ -506,15 +505,14 @@ def generate_video(
                 streaming_tts, audio_path
             )
         if not tts_ok and RUNWAY_AUDIO_FALLBACK:
-            tts_ok = runway_generate_audio(narration, audio_path)
         if not tts_ok:
             silent_track(narration, audio_path)
         progress(0.40, desc="🖼️ Preparing keyframes...")
-        # Handle multi-keyframe: if multiple, cycle through them; else create placeholder
         loaded_keyframes=[]
         if keyframes:
-            for fp in keyframes:
                 try:
                     img=Image.open(fp).convert("RGB")
                     loaded_keyframes.append(img)
@@ -525,31 +523,28 @@ def generate_video(
             temp_files.append(placeholder)
             loaded_keyframes=[Image.open(placeholder).convert("RGB")]
-        # Ratio handling
         if ratio not in SUPPORTED_RATIOS:
             ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
         else:
             ratio_choice = ratio
-        processed_images=[]
         for img in loaded_keyframes:
-            proc = crop_to_ratio(img, ratio_choice)
-            processed_images.append(proc)
-        # Convert processed images to data URIs
         data_uris=[]
-        for img in processed_images:
-            b = bytes()
-            from io import BytesIO
             buf=BytesIO()
             img.save(buf, format="PNG")
-            b=buf.getvalue()
-            data_uris.append("data:image/png;base64,"+base64.b64encode(b).decode("utf-8"))
         video_clips=[]
         for idx, sc in enumerate(scene_objs, start=1):
             progress(0.40 + 0.45*idx/scene_count,
                      desc=f"🎬 Scene {idx}/{scene_count}...")
-            img_uri = data_uris[(idx-1) % len(data_uris)]
             prompt_text = build_scene_prompt(sc)
             clip_path = runway_generate_clip(
                 model=runway_model,
@@ -560,7 +555,6 @@ def generate_video(
             )
             video_clips.append(clip_path); temp_files.append(clip_path)
-            # Sharpness check
             sharp = clip_edge_density(clip_path)
             if sharp < SHARPNESS_MIN:
                 log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
@@ -606,10 +600,9 @@ def refresh_voices():
 # ---------------- Gradio Interface ----------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🎬 AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs + Runway Audio)")
     gr.Markdown(
-        "Iterate quickly with Turbo, then switch to Quality Mode for final fidelity. "
-        "Upload multiple keyframes to improve subject consistency."
     )
     with gr.Row():
@@ -620,9 +613,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
         clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
         ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
-        quality_mode = gr.Checkbox(label="Quality Mode (use gen4 instead of gen4_turbo)", value=False)
-    gr.Markdown("### Narration (Primary: ElevenLabs, Fallback: Runway Audio / Silence)")
     with gr.Row():
         refresh_btn = gr.Button("🔄 Refresh Voices")
         voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
@@ -656,10 +649,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         "### Tips\n"
-        "- Use multiple high-quality keyframes (consistent character & environment).\n"
-        "- Refine camera verbs (slow dolly in, handheld pan, aerial sweep) & lighting adjectives.\n"
-        "- Toggle Quality Mode only when you like the blocking to save credits.\n"
-        "- Add emotional descriptors directly in narration text for richer delivery."
     )
 if __name__ == '__main__':

 """
 AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
 Features:
+- Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration). Gen-4 / Turbo accept 5s or 10s durations only.
+- Structured scene schema (Subject | Action | Camera | Lighting | Mood | Style) -> merged prompt.
+- Multi-keyframe support (upload 1–4 images); automatic ratio cropping to supported Runway aspect ratios.
+- ElevenLabs TTS with: pagination, retry, streaming/non-streaming, adjustable stability/similarity/style/speaker boost.
+- Hard fallback default voice ID (env ELEVEN_DEFAULT_VOICE_ID) if dropdown fetch fails.
+- Runway audio silent fallback placeholder (stub) if all TTS fails (replace later with real Runway audio call if available).
+- Sharpness (edge density) heuristic; one automatic re-generation with detail suffix for blurry clips.
+- Clean temporary file housekeeping; robust logging & progress reporting.
+Environment Variables (required):
+    GEMINI_API_KEY
+    TAVILY_API_KEY
+    RUNWAY_API_KEY  (or RUNWAYML_API_SECRET)
+Optional:
+    ELEVENLABS_API_KEY (or XI_API_KEY)
+    ELEVEN_DEFAULT_VOICE_ID  (fallback voice id)
+Security: NEVER hard-code real API keys in this file.
 """
 import os
 import logging
 import subprocess
 import base64
 from pathlib import Path
+from typing import List, Dict, Any, Optional
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import numpy as np
+# External SDKs
 import google.generativeai as genai
 from tavily import TavilyClient
 from runwayml import RunwayML
 import httpx
+# ---- ElevenLabs (version-agnostic import) ----
 try:
     from elevenlabs import ElevenLabs
     try:
+        from elevenlabs.errors import ApiError  # may not exist in some versions
     except Exception:
         ApiError = Exception
 except ImportError:
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
 ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
+required_missing = [k for k, v in {
     "GEMINI_API_KEY": GEMINI_API_KEY,
     "TAVILY_API_KEY": TAVILY_API_KEY,
     "RUNWAY_API_KEY": RUNWAY_KEY
 }.items() if not v]
+if required_missing:
+    raise RuntimeError(f"Missing required API keys: {', '.join(required_missing)}")
 genai.configure(api_key=GEMINI_API_KEY)
 tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
 # ---------------- Constants ----------------
 DEFAULT_SCENES = 4
 MAX_SCENES = 8
+ALLOWED_DURATIONS = {5, 10}  # Runway Gen-4 / Turbo durations (5 or 10 seconds)
+SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"}  # documented supported aspect ratios
 WORDS_PER_SEC = 2.5
 PLACEHOLDER_BG = (16, 18, 24)
 PLACEHOLDER_FG = (240, 242, 248)
     "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
     "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
 ]
+SHARPNESS_MIN = 0.015
 RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
+GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle camera motion, high detail"
+# Fallback ElevenLabs voice ID (replace with your own or set env var)
+DEFAULT_ELEVEN_VOICE_ID = os.getenv("ELEVEN_DEFAULT_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # example/published sample id
+RUNWAY_AUDIO_FALLBACK = True  # Placeholder stub (replace with real Runway audio generation when available)
 # ---------------- Utility ----------------
 def uid() -> str:
     safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
     return safe or "video"
+def load_font(size: int = 44):
     for p in FONT_CANDIDATES:
         if Path(p).exists():
             try:
         else:
             line.append(w)
     if line: lines.append(" ".join(line))
+    # center vertically
+    metrics=[]; total_h=0
     for ln in lines:
         bbox = draw.textbbox((0,0), ln, font=font)
         h=bbox[3]-bbox[1]
+        metrics.append((ln,h,bbox))
+        total_h += h+12
     y=(height-total_h)//2
     for ln,h,bbox in metrics:
         w=bbox[2]-bbox[0]
     img.save(out)
     return out
 def closest_supported_ratio(w: int, h: int) -> str:
     """Pick the supported Runway ratio closest to a ``w`` x ``h`` frame.

     Each entry of ``SUPPORTED_RATIOS`` is a ``"W:H"`` string; the winner is
     the entry whose W/H quotient has the smallest absolute distance to
     ``w / h``. Ties are broken by the ratio string itself, so the result does
     not depend on set iteration order.

     Args:
         w: Frame width in pixels; must be positive.
         h: Frame height in pixels; must be positive.

     Returns:
         The chosen ``"W:H"`` string from ``SUPPORTED_RATIOS``.

     Raises:
         ValueError: If ``w`` or ``h`` is not positive (a zero height
             previously surfaced as a bare ``ZeroDivisionError``).
     """
     if w <= 0 or h <= 0:
         raise ValueError(f"Image dimensions must be positive, got {w}x{h}")
     cur_ratio = w / h

     def distance(r: str):
         rw, rh = map(int, r.split(":"))
         # The string is a secondary key for deterministic tie-breaking.
         return abs(cur_ratio - (rw / rh)), r

     return min(SUPPORTED_RATIOS, key=distance)
 def crop_to_ratio(img: "Image.Image", ratio: str) -> "Image.Image":
     """Center-crop ``img`` to the aspect ratio given as a ``"W:H"`` string.

     When the current ratio is within 1e-3 of the target the image is returned
     as-is. Otherwise the excess is removed equally from both sides of the
     longer axis. No scaling is performed.

     Args:
         img: Object exposing ``size`` as ``(width, height)`` and ``crop(box)``.
         ratio: Target ratio as ``"W:H"`` with positive integer terms.

     Returns:
         ``img`` unchanged if it already matches, else the cropped image.

     Raises:
         ValueError: If the ratio terms or image dimensions are not positive
             (previously an opaque ``ZeroDivisionError``).
     """
     rw, rh = map(int, ratio.split(":"))
     if rw <= 0 or rh <= 0:
         raise ValueError(f"Aspect ratio terms must be positive, got {ratio!r}")
     w, h = img.size
     if w <= 0 or h <= 0:
         raise ValueError(f"Image dimensions must be positive, got {w}x{h}")
     if abs(w / h - rw / rh) < 1e-3:
         return img
     # Exact integer math replaces int(float) truncation; the max(1, ...)
     # clamp keeps extreme slivers from being cropped to a zero-pixel side.
     if w * rh > h * rw:  # too wide
         new_w = max(1, (h * rw) // rh)
         x0 = (w - new_w) // 2
         return img.crop((x0, 0, x0 + new_w, h))
     # too tall
     new_h = max(1, (w * rh) // rw)
     y0 = (h - new_h) // 2
     return img.crop((0, y0, w, y0 + new_h))
 # ---------------- Gemini Script Generation ----------------
 def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
+    """
+    Request structured JSON with narration + scene objects containing schema fields.
+    """
     prompt = f"""
 You are a creative director.
       "lighting": "...",
       "mood": "...",
       "style": "...",
+      "prompt": "<merged scene prompt (<=40 words)>"
     }}
+    (exactly {scene_count} objects)
   ]
 }}
 Rules:
+- Keep one consistent main subject across scenes unless evolution is explicitly helpful.
+- camera: ONE motion (e.g. "slow dolly in", "handheld pan", "aerial sweep").
+- lighting: descriptive & cinematic (e.g. "golden hour rim light").
+- style: filmic adjectives (e.g. "35mm film grain, rich color palette").
+- merged prompt must integrate key fields succinctly.
+- No markdown, no lists, no commentary outside JSON.
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
     norm=[]
     for sc in scenes:
         if not isinstance(sc,dict): continue
+        prompt_txt = sc.get("prompt") or ""
         norm.append({
             "subject": sc.get("subject",""),
             "action": sc.get("action",""),
             "subject":"main subject",
             "action":"subtle motion",
             "camera":"slow dolly in",
+            "lighting":"soft directional key light",
             "mood":"cinematic",
             "style":"filmic grain",
             "prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
     norm=norm[:scene_count]
     return {"narration": narration, "scenes": norm}
+# ---------------- ElevenLabs Voice Handling ----------------
+def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.5) -> List[Dict[str,str]]:
     if not eleven_client:
         return []
     voices=[]
         if not token:
             break
         time.sleep(delay)
+    log.info(f"Fetched {len(voices)} ElevenLabs voices.")
     return voices
 def tts_elevenlabs(text: str, voice_id: str, model_id: str,
                    stability: float, similarity: float,
                    style: float, speaker_boost: bool,
                    streaming: bool, out_path: str) -> bool:
+    if not eleven_client:
+        log.warning("ElevenLabs client not initialized.")
+        return False
+    if not voice_id:
+        log.warning("No voice_id provided for TTS.")
         return False
     try:
         stability=max(0,min(1,stability))
         similarity=max(0,min(1,similarity))
         style=max(0,min(1,style))
             )
             with open(out_path,"wb") as f:
                 f.write(audio)
+        # sanity size check
+        if os.path.getsize(out_path) < 800:
+            log.error("ElevenLabs audio too small; treating as failure.")
+            return False
         return True
     except ApiError as e:
         log.error(f"ElevenLabs ApiError: {e}")
         log.error(f"ElevenLabs TTS error: {e}")
     return False
+# ---------------- Runway Audio Fallback (placeholder silent track) ----------------
+def runway_audio_fallback(text: str, out_path: str) -> bool:
     if not RUNWAY_AUDIO_FALLBACK:
         return False
     try:
         duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
         subprocess.run([
             "ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
         log.error(f"Runway audio fallback failed: {e}")
         return False
 def silent_track(narration: str, out_path: str):
     duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
     subprocess.run([
             prompt_text=text_prompt,
             duration=duration,
             ratio=ratio
+        )  # API pattern for gen4 / turbo image-to-video
     except Exception as e:
         raise gr.Error(f"Runway task creation failed: {e}")
 # ---------------- Sharpness Heuristic ----------------
 def clip_edge_density(path: str) -> float:
     """Estimate clip sharpness as the mean edge intensity of one frame.

     ffmpeg writes a single frame, scaled to 320 px wide, to a temporary PNG.
     The frame is converted to grayscale, passed through PIL's ``FIND_EDGES``
     filter, and the mean pixel value is normalised to the 0..1 range.

     Args:
         path: Path of the video file to inspect.

     Returns:
         The edge density, or ``1.0`` if any step fails, so a clip that cannot
         be analysed is treated as acceptable.
     """
     tmp = None
     try:
         tmp = f"frame_{uid()}.png"
         subprocess.run([
             "ffmpeg","-i",path,"-vf","scale=320:-1","-vframes","1",tmp,"-y"
         ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
         # The context manager closes the PNG handle before the file is deleted.
         with Image.open(tmp) as frame:
             arr = np.array(frame.convert("L").filter(ImageFilter.FIND_EDGES))
         return arr.mean() / 255.0
     except Exception:
         return 1.0  # assume acceptable if analysis fails
     finally:
         # Previously the frame file leaked whenever a step after ffmpeg raised.
         if tmp is not None:
             try:
                 os.remove(tmp)
             except OSError:
                 pass
 # ---------------- Concatenate & Mux ----------------
 def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
         try: os.remove(p)
         except OSError: pass
+# ---------------- Prompt Assembly ----------------
 def build_scene_prompt(sc: Dict[str,str]) -> str:
     """Assemble the Runway text prompt for a scene and append ``GLOBAL_STYLE``.

     A non-blank ``"prompt"`` value is used directly. Otherwise the prompt is
     composed as ``"<subject> <action>, <camera>, <lighting>, <mood>, <style>"``
     from whichever of those fields are present and non-empty, so missing keys
     no longer raise ``KeyError`` and blank fields no longer leave stray commas.

     Args:
         sc: Scene dictionary; every key is optional.

     Returns:
         ``"<scene prompt>. <GLOBAL_STYLE>"``, or just ``GLOBAL_STYLE`` when
         the scene has no usable text at all.
     """
     def field(key: str) -> str:
         return str(sc.get(key) or "").strip()

     merged = field("prompt")
     if not merged:
         lead = " ".join(p for p in (field("subject"), field("action")) if p)
         rest = [field(k) for k in ("camera", "lighting", "mood", "style")]
         merged = ", ".join(p for p in [lead, *rest] if p)
     # Strip trailing periods/spaces so the joiner below never doubles them.
     merged = merged.rstrip(". ")
     return f"{merged}. {GLOBAL_STYLE}" if merged else GLOBAL_STYLE
 # ---------------- Main Pipeline ----------------
 def generate_video(
     topic: str,
+    keyframes: list,
     scene_count: int,
     clip_duration: int,
     ratio: str,
         scene_count = max(1,min(MAX_SCENES,scene_count))
         if clip_duration not in ALLOWED_DURATIONS:
             clip_duration=5
+        runway_model = "gen4" if quality_mode else "gen4_turbo"  # trade speed vs fidelity
         progress(0.05, desc="🔍 Researching...")
         facts = research_topic(topic)
         audio_path=f"narration_{job}.mp3"
         temp_files.append(audio_path)
+        # Determine voice id (UI or default fallback)
         if voice_choice and "|" in voice_choice:
+            voice_id = voice_choice.split("|",1)[1].strip()
+        else:
+            voice_id = DEFAULT_ELEVEN_VOICE_ID
+        log.info(f"[JOB {job}] Using voice_id='{voice_id}' model_id='{model_id}' (quality={quality_mode})")
         tts_ok=False
         if ELEVEN_KEY and voice_id:
                 streaming_tts, audio_path
             )
         if not tts_ok and RUNWAY_AUDIO_FALLBACK:
+            tts_ok = runway_audio_fallback(narration, audio_path)
         if not tts_ok:
             silent_track(narration, audio_path)
         progress(0.40, desc="🖼️ Preparing keyframes...")
         loaded_keyframes=[]
         if keyframes:
+            for fp in keyframes[:4]:
                 try:
                     img=Image.open(fp).convert("RGB")
                     loaded_keyframes.append(img)
             temp_files.append(placeholder)
             loaded_keyframes=[Image.open(placeholder).convert("RGB")]
         if ratio not in SUPPORTED_RATIOS:
             ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
         else:
             ratio_choice = ratio
+        processed=[]
         for img in loaded_keyframes:
+            processed.append(crop_to_ratio(img, ratio_choice))
+        # Data URIs for Runway image_to_video
         data_uris=[]
+        from io import BytesIO
+        for img in processed:
             buf=BytesIO()
             img.save(buf, format="PNG")
+            data_uris.append("data:image/png;base64,"+base64.b64encode(buf.getvalue()).decode("utf-8"))
         video_clips=[]
         for idx, sc in enumerate(scene_objs, start=1):
             progress(0.40 + 0.45*idx/scene_count,
                      desc=f"🎬 Scene {idx}/{scene_count}...")
+            img_uri = data_uris[(idx-1)%len(data_uris)]
             prompt_text = build_scene_prompt(sc)
             clip_path = runway_generate_clip(
                 model=runway_model,
             )
             video_clips.append(clip_path); temp_files.append(clip_path)
             sharp = clip_edge_density(clip_path)
             if sharp < SHARPNESS_MIN:
                 log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
 # ---------------- Gradio Interface ----------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎬 AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs)")
     gr.Markdown(
+        "Iterate with Turbo, finalize with Gen-4. Upload up to 4 keyframes for stronger subject consistency."
     )
     with gr.Row():
         scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
         clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
         ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
+        quality_mode = gr.Checkbox(label="Quality Mode (gen4 vs gen4_turbo)", value=False)
+    gr.Markdown("### Narration (ElevenLabs primary; fallback silent track)")
     with gr.Row():
         refresh_btn = gr.Button("🔄 Refresh Voices")
         voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
     gr.Markdown(
         "### Tips\n"
+        "- Use detailed keyframes with clear subject & lighting.\n"
+        "- Add emotional descriptors directly in narration text for richer prosody.\n"
+        "- Iterate with Turbo then switch to Quality Mode to finalize.\n"
+        "- Adjust Stability/Similarity for expressiveness vs consistency."
     )
 if __name__ == '__main__':