mgbam committed
Commit 9c2d4ce · verified · 1 Parent(s): 0c28ab5

Update app.py

Files changed (1):
  1. app.py +253 -207
app.py CHANGED
@@ -12,12 +12,14 @@ import gradio as gr
12
  from PIL import Image, ImageDraw, ImageFont
13
 
14
  # External SDKs
15
- import google.generativeai as genai # Gemini
16
- from tavily import TavilyClient # Research enrichment
17
- from runwayml import RunwayML, TaskFailedError # Runway official SDK
18
- from elevenlabs import ElevenLabs, VoiceSettings # ElevenLabs official SDK
19
-
20
- # ---------------- Logging Setup ----------------
21
  logging.basicConfig(
22
  level=logging.INFO,
23
  format="[%(levelname)s %(asctime)s] %(message)s",
@@ -25,113 +27,113 @@ logging.basicConfig(
25
  )
26
  log = logging.getLogger("ai_video_studio")
27
 
28
- # ---------------- Configuration & Keys ----------------
29
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
31
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
32
- ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
33
 
34
- REQUIRED = {
35
  "GEMINI_API_KEY": GEMINI_API_KEY,
36
  "TAVILY_API_KEY": TAVILY_API_KEY,
37
- "RUNWAY_API_KEY / RUNWAYML_API_SECRET": RUNWAY_KEY,
38
- }
39
- missing = [k for k, v in REQUIRED.items() if not v]
40
  if missing:
41
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
42
 
43
- # ElevenLabs is optional; if absent we fall back to mock audio.
44
- ELEVEN_AVAILABLE = bool(ELEVEN_KEY)
45
-
46
- # Configure clients
47
  genai.configure(api_key=GEMINI_API_KEY)
48
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
49
  runway_client = RunwayML(api_key=RUNWAY_KEY)
50
- eleven_client: Optional[ElevenLabs] = ElevenLabs(api_key=ELEVEN_KEY) if ELEVEN_AVAILABLE else None
51
 
52
  # ---------------- Constants ----------------
53
  DEFAULT_SCENES = 4
54
  MAX_SCENES = 8
55
- WORDS_PER_SEC = 2.5 # heuristic for mock silent track
56
- ALLOWED_DURATIONS = {5, 10}
57
  PLACEHOLDER_BG = (18, 18, 22)
58
  PLACEHOLDER_FG = (239, 239, 245)
59
  FONT_CANDIDATES = [
60
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
61
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
62
  ]
63
- GLOBAL_STYLE = (
64
- "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
65
- )
66
 
67
- # ---------------- Utility Functions ----------------
68
  def uid() -> str:
69
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
70
 
71
  def sanitize_filename(name: str) -> str:
72
- safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:64]
73
  return safe or "video"
74
 
75
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
 
76
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
77
  draw = ImageDraw.Draw(img)
78
  font = None
79
  for path in FONT_CANDIDATES:
80
  if Path(path).exists():
81
  try:
82
- font = ImageFont.truetype(path, 44)
83
  break
84
  except Exception:
85
  pass
86
  if font is None:
87
  font = ImageFont.load_default()
88
 
89
- words = topic.split()
90
- lines = []
91
- cur = []
92
- max_chars = 22
93
- for w in words:
94
- test = " ".join(cur + [w])
95
  if len(test) > max_chars:
96
- lines.append(" ".join(cur))
97
- cur = [w]
98
  else:
99
- cur.append(w)
100
- if cur:
101
- lines.append(" ".join(cur))
102
 
103
  total_h = 0
104
- for ln in lines:
 
105
  bbox = draw.textbbox((0, 0), ln, font=font)
106
- total_h += (bbox[3] - bbox[1]) + 8
107
  y = (height - total_h) // 2
108
- for ln in lines:
109
  bbox = draw.textbbox((0, 0), ln, font=font)
110
  w = bbox[2] - bbox[0]
111
  x = (width - w) // 2
112
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
113
- y += (bbox[3] - bbox[1]) + 8
114
 
115
  out_path = f"placeholder_{uid()}.png"
116
  img.save(out_path)
117
  return out_path
118
 
119
  def research_topic(topic: str) -> str:
 
120
  try:
121
  results = tavily_client.search(
122
  query=f"Key facts and interesting points about {topic}",
123
  search_depth="basic"
124
  )
125
  if results and "results" in results:
126
- return "
127
- ".join(
128
- str(r.get("content", "")).strip() for r in results["results"] if r.get("content")
 
129
  )
130
  except Exception as e:
131
  log.warning(f"Tavily failed: {e}")
132
  return "No supplemental research facts available."
133
 
134
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
 
135
  prompt = f"""
136
  You are a creative director for short-form educational / promotional videos.
137
 
@@ -140,117 +142,151 @@ Topic: {topic}
140
  Supplemental Facts:
141
  {facts}
142
 
143
- Produce STRICT JSON with keys:
144
- "narration_script": string
145
- "scene_prompts": list[{scene_count}] of cinematic prompts (<=40 words each), no numbering.
146
- Each scene prompt MUST specify a consistent main subject, camera/movement, and lighting/mood.
147
- JSON ONLY, no markdown fences.
148
  """
149
  model = genai.GenerativeModel("gemini-1.5-flash")
150
  response = model.generate_content(prompt)
151
  raw = (response.text or "").strip()
 
152
  if raw.startswith("```"):
153
- raw = raw.strip("`").lstrip("json").strip()
154
  data = None
155
  try:
156
  data = json.loads(raw)
157
  except json.JSONDecodeError:
158
- start, end = raw.find("{"), raw.rfind("}")
 
159
  if start != -1 and end != -1:
160
  try:
161
  data = json.loads(raw[start:end + 1])
162
  except Exception:
163
  pass
164
  if not isinstance(data, dict):
165
- raise gr.Error("Gemini did not return valid JSON structure.")
 
166
  narration = data.get("narration_script")
167
  scenes = data.get("scene_prompts")
 
168
  if isinstance(narration, list):
169
  narration = " ".join(map(str, narration))
170
  if not isinstance(narration, str) or not narration.strip():
171
  raise gr.Error("Invalid narration_script returned.")
172
  narration = narration.strip()
 
173
  if not isinstance(scenes, list):
174
- raise gr.Error("scene_prompts is not a list.")
175
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
176
  if len(scenes) != scene_count:
 
177
  while len(scenes) < scene_count:
178
- scenes.append(scenes[-1] if scenes else f"Establishing cinematic shot about {topic}")
179
  scenes = scenes[:scene_count]
180
- return {"narration": narration, "scenes": scenes}
181
-
182
- def ensure_duration(narration: str) -> float:
183
- return max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
184
-
185
- def mock_audio(narration: str, out_path: str) -> float:
186
- duration = ensure_duration(narration)
187
- subprocess.run([
188
- "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
189
- "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame", out_path, "-y"
190
- ], check=True)
191
- return duration
192
 
193
- def elevenlabs_tts(narration: str, voice_id: str, out_path: str, model: str, optimize_streaming_latency: int, use_stream: bool) -> float:
194
- if not ELEVEN_AVAILABLE:
195
- raise gr.Error("ElevenLabs API key not configured.")
196
- # Streaming or non-streaming generation
197
- if use_stream:
198
- # Streaming: write chunks as they arrive
199
- with open(out_path, "wb") as f:
200
- for chunk in eleven_client.text_to_speech.convert(
201
- voice_id=voice_id,
202
- optimize_streaming_latency=optimize_streaming_latency,
203
- model_id=model,
204
- output_format="mp3_44100_128",
205
- text=narration,
206
- voice_settings=VoiceSettings(
207
- stability=0.5,
208
- similarity_boost=0.8,
209
- style=0.3,
210
- use_speaker_boost=True,
211
- ),
212
- stream=True,
213
- ):
214
- if isinstance(chunk, bytes):
215
- f.write(chunk)
216
- else:
217
- audio = eleven_client.text_to_speech.convert(
218
- voice_id=voice_id,
219
- model_id=model,
220
- output_format="mp3_44100_128",
221
- text=narration,
222
- voice_settings=VoiceSettings(
223
- stability=0.5,
224
- similarity_boost=0.8,
225
- style=0.3,
226
- use_speaker_boost=True,
227
- ),
228
- )
229
- with open(out_path, "wb") as f:
230
- f.write(audio)
231
- # Roughly compute duration from word count; could probe with ffprobe for exact.
232
- return ensure_duration(narration)
233
 
 
234
  def list_elevenlabs_voices() -> List[Dict[str, str]]:
235
- if not ELEVEN_AVAILABLE:
 
236
  return []
237
  try:
 
238
  voices = eleven_client.voices.get_all()
239
- out = []
 
240
  for v in voices.voices:
241
- out.append({"id": v.voice_id, "name": v.name})
242
- return out
243
  except Exception as e:
244
- log.warning(f"Failed to list voices: {e}")
245
  return []
246
 
247
- def build_prompt_image_data_uri(image_path: str) -> str:
248
- import base64
249
- with open(image_path, "rb") as f:
250
- b64 = base64.b64encode(f.read()).decode("utf-8")
251
- return f"data:image/png;base64,{b64}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
 
253
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
 
254
  try:
255
  task = runway_client.image_to_video.create(
256
  model="gen4_turbo",
@@ -282,9 +318,8 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
282
  raise gr.Error("Runway returned no outputs.")
283
  video_url = outputs[0]
284
 
285
- import httpx
286
  clip_path = f"runway_clip_{uid()}.mp4"
287
- with httpx.stream("GET", video_url, timeout=120) as resp:
288
  resp.raise_for_status()
289
  with open(clip_path, "wb") as f:
290
  for chunk in resp.iter_bytes():
@@ -295,40 +330,41 @@ def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> No
295
  list_file = f"concat_{uid()}.txt"
296
  with open(list_file, "w") as lf:
297
  for p in video_paths:
298
- lf.write(f"file '{p}'
299
- ")
300
  temp_concat = f"combined_{uid()}.mp4"
301
  subprocess.run([
302
- "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", temp_concat, "-y"
 
303
  ], check=True)
304
  subprocess.run([
305
- "ffmpeg", "-i", temp_concat, "-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
 
306
  ], check=True)
307
  for p in (list_file, temp_concat):
308
- try:
309
- os.remove(p)
310
- except OSError:
311
- pass
312
 
313
- def enhance_scene_prompt(base: str) -> str:
314
- return f"{base}. {GLOBAL_STYLE}"
315
 
316
- # ---------------- Main Generation Function ----------------
317
  def generate_video_from_topic(
318
  topic: str,
319
- keyframe_image: Optional[str],
320
  scene_count: int,
321
  clip_duration: int,
322
  ratio: str,
323
- use_eleven: bool,
324
- eleven_voice: str,
325
- eleven_model: str,
326
- streaming: bool,
327
- optimize_latency: int,
328
  progress=gr.Progress(track_tqdm=True)
329
  ) -> str:
330
  job = uid()
331
- log.info(f"[AI-STUDIO] Starting job {job} :: topic='{topic}'")
332
  temp_files: List[str] = []
333
  try:
334
  if not topic or not topic.strip():
@@ -345,65 +381,79 @@ def generate_video_from_topic(
345
  narration = script["narration"]
346
  scenes = script["scenes"]
347
 
348
- progress(0.30, desc="🎙️ Generating narration audio...")
349
- audio_path = f"audio_{job}.mp3"
350
  temp_files.append(audio_path)
351
- if use_eleven and ELEVEN_AVAILABLE:
352
- elevenlabs_tts(
353
- narration=narration,
354
- voice_id=eleven_voice,
355
- out_path=audio_path,
356
- model=eleven_model,
357
- optimize_streaming_latency=optimize_latency,
358
- use_stream=streaming,
359
  )
360
- else:
361
- mock_audio(narration, audio_path)
362
 
363
- progress(0.40, desc="🖼️ Preparing keyframe image...")
364
- if keyframe_image:
365
- prompt_image_path = keyframe
366
  else:
367
  prompt_image_path = generate_placeholder_image(topic)
368
  temp_files.append(prompt_image_path)
369
- prompt_image_data_uri = build_prompt_image_data_uri(prompt_image_path)
370
 
 
371
  video_clips: List[str] = []
 
372
  for idx, base_prompt in enumerate(scenes, start=1):
373
- progress(0.40 + (0.45 * idx / scene_count), desc=f"🎬 Generating scene {idx}/{scene_count}...")
374
- full_prompt = enhance_scene_prompt(base_prompt)
 
375
  try:
376
  clip_path = runway_generate_clip(
377
- prompt_image=prompt_image_data_uri,
378
  text_prompt=full_prompt,
379
  duration=clip_duration,
380
  ratio=ratio
381
  )
382
  except Exception as e:
383
- log.error(f"Scene {idx} failed: {e}; retrying once with refined prompt")
384
- retry_prompt = full_prompt + " -- refined detail, consistent style"
385
  clip_path = runway_generate_clip(
386
- prompt_image=prompt_image_data_uri,
387
  text_prompt=retry_prompt,
388
  duration=clip_duration,
389
  ratio=ratio
390
  )
391
- video_clips.append(clip_path)
392
- temp_files.append(clip_path)
393
 
394
  progress(0.92, desc="🧵 Stitching scenes...")
395
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
396
  concat_and_mux(video_clips, audio_path, final_out)
397
 
398
  progress(1.0, desc="✅ Done!")
399
- log.info(f"[AI-STUDIO] Job {job} completed -> {final_out}")
400
  return final_out
401
 
402
  except Exception as e:
403
- log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
404
  raise gr.Error(f"An error occurred: {e}")
405
  finally:
406
- # Clean up intermediate (keep keyframe if user uploaded it)
407
  for p in temp_files:
408
  try:
409
  if os.path.exists(p):
@@ -411,75 +461,71 @@ def generate_video_from_topic(
411
  except OSError:
412
  pass
413
 
414
- # ---------------- Voice Helper for UI ----------------
415
- def get_voice_choices() -> List[str]:
416
  voices = list_elevenlabs_voices()
417
- if not voices:
418
- return ["eleven_monolingual_v1"] # fallback placeholder id name pattern
419
- return [f"{v['name']}|{v['id']}" for v in voices]
420
-
421
- VOICE_CHOICES = get_voice_choices()
422
- DEFAULT_VOICE = VOICE_CHOICES[0] if VOICE_CHOICES else "Rachel|21m00Tcm4TlvDq8ikWAM" # Example default voice id pattern
423
 
424
  # ---------------- Gradio UI ----------------
425
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
426
  gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
427
  gr.Markdown(
428
- "Generate a multi-scene AI video: research β†’ script β†’ voiceover (mock or ElevenLabs) β†’ Gen-4 Turbo clips β†’ stitch."
 
429
  )
430
 
431
  with gr.Row():
432
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
433
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
434
 
435
- with gr.Accordion("Narration Settings (ElevenLabs)", open=False):
436
- use_eleven = gr.Checkbox(value=ELEVEN_AVAILABLE, label="Use ElevenLabs (falls back to mock if unchecked or unavailable)")
437
- voice_select = gr.Dropdown(choices=VOICE_CHOICES, value=DEFAULT_VOICE, label="Voice (Name|ID)")
438
- eleven_model = gr.Textbox(value="eleven_turbo_v2_5", label="ElevenLabs Model ID")
439
- streaming = gr.Checkbox(value=True, label="Stream TTS (lower latency)")
440
- optimize_latency = gr.Slider(0, 4, value=0, step=1, label="Optimize Streaming Latency (0=off, higher=more aggressive)")
441
-
442
  with gr.Row():
443
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
444
  duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
445
- ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"], value="1280:720", label="Aspect Ratio")
446
 
447
  generate_btn = gr.Button("🚀 Generate Video", variant="primary")
448
  output_video = gr.Video(label="Final Video")
449
 
450
- def _parse_voice(v: str) -> str:
451
- if "|" in v:
452
- return v.split("|", 1)[1]
453
- return v
454
-
455
- def wrapper(topic, keyframe, scene_count, duration, ratio, use_eleven, voice_combo, eleven_model, streaming, optimize_latency):
456
- voice_id = _parse_voice(voice_combo)
457
- return generate_video_from_topic(
458
- topic=topic,
459
- keyframe_image=keyframe,
460
- scene_count=scene_count,
461
- clip_duration=int(duration),
462
- ratio=ratio,
463
- use_eleven=use_eleven,
464
- eleven_voice=voice_id,
465
- eleven_model=eleven_model.strip() or "eleven_turbo_v2_5",
466
- streaming=streaming,
467
- optimize_latency=int(optimize_latency),
468
- )
469
 
470
  generate_btn.click(
471
- fn=wrapper,
472
- inputs=[topic, keyframe, scene_count, duration, ratio, use_eleven, voice_select, eleven_model, streaming, optimize_latency],
473
  outputs=output_video
474
  )
475
 
476
- gr.Markdown("""---
477
- ### Tips
478
- - Upload a keyframe to increase subject continuity.
479
- - Refine prompts by editing the generated scene prompts logic (extend code for manual review step).
480
- - ElevenLabs: if you get 401 errors, verify the API key and voice ID. For new voices, refresh the Space (reload) to repopulate the list.
481
- - Use 5s scenes for faster iteration; switch to 10s for final renders.
482
- """)
483
 
484
  if __name__ == "__main__":
485
  demo.launch()
 
12
  from PIL import Image, ImageDraw, ImageFont
13
 
14
  # External SDKs
15
+ import google.generativeai as genai # Gemini
16
+ from tavily import TavilyClient # Research enrichment
17
+ from runwayml import RunwayML, TaskFailedError # Runway SDK
18
+ from elevenlabs import ElevenLabs, APIError # ElevenLabs TTS (pip install elevenlabs)
19
+ import httpx
20
+ import base64
21
+
22
+ # ---------------- Logging ----------------
23
  logging.basicConfig(
24
  level=logging.INFO,
25
  format="[%(levelname)s %(asctime)s] %(message)s",
 
27
  )
28
  log = logging.getLogger("ai_video_studio")
29
 
30
+ # ---------------- Configuration / Keys ----------------
31
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
32
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
33
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
34
+ ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
35
 
36
+ missing = [k for k, v in {
37
  "GEMINI_API_KEY": GEMINI_API_KEY,
38
  "TAVILY_API_KEY": TAVILY_API_KEY,
39
+ "RUNWAY_API_KEY": RUNWAY_KEY
40
+ }.items() if not v]
 
41
  if missing:
42
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
43
 
44
  genai.configure(api_key=GEMINI_API_KEY)
45
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
46
  runway_client = RunwayML(api_key=RUNWAY_KEY)
47
+ eleven_client: Optional[ElevenLabs] = None
48
+ if ELEVEN_KEY:
49
+ eleven_client = ElevenLabs(api_key=ELEVEN_KEY)
50
 
51
  # ---------------- Constants ----------------
52
  DEFAULT_SCENES = 4
53
  MAX_SCENES = 8
54
+ ALLOWED_DURATIONS = {5, 10} # Runway Gen-4 supported lengths (seconds)
55
+ WORDS_PER_SEC = 2.5 # Heuristic for mock track
56
  PLACEHOLDER_BG = (18, 18, 22)
57
  PLACEHOLDER_FG = (239, 239, 245)
58
  FONT_CANDIDATES = [
59
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
60
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
61
  ]
62
 
63
+ # ---------------- Utility ----------------
64
  def uid() -> str:
65
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
66
 
67
  def sanitize_filename(name: str) -> str:
68
+ safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:60]
69
  return safe or "video"
70
 
71
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
72
+ """Create a simple PNG keyframe if user didn't upload one."""
73
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
74
  draw = ImageDraw.Draw(img)
75
  font = None
76
  for path in FONT_CANDIDATES:
77
  if Path(path).exists():
78
  try:
79
+ font = ImageFont.truetype(path, 42)
80
  break
81
  except Exception:
82
  pass
83
  if font is None:
84
  font = ImageFont.load_default()
85
 
86
+ max_chars = 24
87
+ wrapped: List[str] = []
88
+ line: List[str] = []
89
+ for w in topic.split():
90
+ test = " ".join(line + [w])
 
91
  if len(test) > max_chars:
92
+ wrapped.append(" ".join(line))
93
+ line = [w]
94
  else:
95
+ line.append(w)
96
+ if line:
97
+ wrapped.append(" ".join(line))
98
 
99
  total_h = 0
100
+ line_metrics = []
101
+ for ln in wrapped:
102
  bbox = draw.textbbox((0, 0), ln, font=font)
103
+ h = bbox[3] - bbox[1]
104
+ line_metrics.append((ln, h))
105
+ total_h += h + 10
106
  y = (height - total_h) // 2
107
+ for ln, h in line_metrics:
108
  bbox = draw.textbbox((0, 0), ln, font=font)
109
  w = bbox[2] - bbox[0]
110
  x = (width - w) // 2
111
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
112
+ y += h + 10
113
 
114
  out_path = f"placeholder_{uid()}.png"
115
  img.save(out_path)
116
  return out_path
117
 
118
  def research_topic(topic: str) -> str:
119
+ """Fetch supplemental facts; return safe fallback if API fails."""
120
  try:
121
  results = tavily_client.search(
122
  query=f"Key facts and interesting points about {topic}",
123
  search_depth="basic"
124
  )
125
  if results and "results" in results:
126
+ return "\n".join(
127
+ str(r.get("content", "")).strip()
128
+ for r in results["results"]
129
+ if r.get("content")
130
  )
131
  except Exception as e:
132
  log.warning(f"Tavily failed: {e}")
133
  return "No supplemental research facts available."
134
 
135
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
136
+ """Obtain narration + scene prompts as structured JSON from Gemini."""
137
  prompt = f"""
138
  You are a creative director for short-form educational / promotional videos.
139
 
142
  Supplemental Facts:
143
  {facts}
144
 
145
+ Return STRICT JSON:
146
+ {{
147
+ "narration_script": "<single cohesive narration>",
148
+ "scene_prompts": ["<scene 1>", ... (exactly {scene_count} total) ]
149
+ }}
150
+
151
+ Scene prompt requirements:
152
+ - <= 40 words
153
+ - Consistent main subject
154
+ - Include camera/movement term (e.g. "slow dolly in", "aerial sweep")
155
+ - Mention lighting/mood
156
+ NO markdown, NO extra commentary.
157
  """
158
  model = genai.GenerativeModel("gemini-1.5-flash")
159
  response = model.generate_content(prompt)
160
  raw = (response.text or "").strip()
161
+
162
  if raw.startswith("```"):
163
+ # strip code fences if present
164
+ raw = raw.strip("`")
165
+ if raw.lower().startswith("json"):
166
+ raw = raw[4:].strip()
167
+
168
  data = None
169
  try:
170
  data = json.loads(raw)
171
  except json.JSONDecodeError:
172
+ start = raw.find("{")
173
+ end = raw.rfind("}")
174
  if start != -1 and end != -1:
175
  try:
176
  data = json.loads(raw[start:end + 1])
177
  except Exception:
178
  pass
179
  if not isinstance(data, dict):
180
+ raise gr.Error("Gemini did not return valid JSON.")
181
+
182
  narration = data.get("narration_script")
183
  scenes = data.get("scene_prompts")
184
+
185
  if isinstance(narration, list):
186
  narration = " ".join(map(str, narration))
187
  if not isinstance(narration, str) or not narration.strip():
188
  raise gr.Error("Invalid narration_script returned.")
189
  narration = narration.strip()
190
+
191
  if not isinstance(scenes, list):
192
+ raise gr.Error("scene_prompts missing or not a list.")
193
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
194
  if len(scenes) != scene_count:
195
+ # normalize length
196
  while len(scenes) < scene_count:
197
+ scenes.append(f"Dynamic cinematic shot about {topic}")
198
  scenes = scenes[:scene_count]
199
 
200
+ return {"narration": narration, "scenes": scenes}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
+ # ---------------- ElevenLabs Integration ----------------
203
  def list_elevenlabs_voices() -> List[Dict[str, str]]:
204
+ """Fetch voices (name + id) if ElevenLabs key available."""
205
+ if not eleven_client:
206
  return []
207
  try:
208
+ # The SDK's voices list method (internally hits the list voices endpoint)
209
  voices = eleven_client.voices.get_all()
210
+ # Normalize to simple dict
211
+ simplified = []
212
  for v in voices.voices:
213
+ simplified.append({"id": v.voice_id, "name": v.name})
214
+ return simplified
215
  except Exception as e:
216
+ log.warning(f"Failed to list ElevenLabs voices: {e}")
217
  return []
218
 
219
+ def synthesize_narration_elevenlabs(
220
+ text: str,
221
+ voice_id: str,
222
+ model_id: str,
223
+ stability: float,
224
+ similarity: float,
225
+ style: float,
226
+ speaker_boost: bool,
227
+ streaming: bool,
228
+ out_path: str
229
+ ) -> bool:
230
+ """Return True on success; False triggers fallback."""
231
+ if not eleven_client:
232
+ return False
233
+ try:
234
+ # Bound parameters
235
+ stability = max(0.0, min(1.0, stability))
236
+ similarity = max(0.0, min(1.0, similarity))
237
+ style = max(0.0, min(1.0, style))
238
+
239
+ if streaming:
240
+ # Streaming synthesis (chunked)
241
+ with open(out_path, "wb") as f:
242
+ for chunk in eleven_client.text_to_speech.convert_as_stream(
243
+ voice_id=voice_id,
244
+ model_id=model_id,
245
+ text=text,
246
+ optimize_streaming_latency=3,
247
+ voice_settings={
248
+ "stability": stability,
249
+ "similarity_boost": similarity,
250
+ "style": style,
251
+ "use_speaker_boost": speaker_boost
252
+ }
253
+ ):
254
+ f.write(chunk)
255
+ else:
256
+ # Standard synthesis (single request)
257
+ audio = eleven_client.text_to_speech.convert(
258
+ voice_id=voice_id,
259
+ model_id=model_id,
260
+ text=text,
261
+ voice_settings={
262
+ "stability": stability,
263
+ "similarity_boost": similarity,
264
+ "style": style,
265
+ "use_speaker_boost": speaker_boost
266
+ }
267
+ )
268
+ with open(out_path, "wb") as f:
269
+ f.write(audio)
270
+ return True
271
+ except APIError as e:
272
+ log.error(f"ElevenLabs API error: {e}")
273
+ except Exception as e:
274
+ log.error(f"ElevenLabs synthesis failed: {e}")
275
+ return False
276
+
277
+ def generate_mock_voiceover(narration: str, out_path: str) -> float:
278
+ """Silent track matching approximate narration length (fallback)."""
279
+ duration = max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
280
+ subprocess.run([
281
+ "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
282
+ "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame",
283
+ out_path, "-y"
284
+ ], check=True)
285
+ return duration
286
 
287
+ # ---------------- Runway Integration ----------------
288
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
289
+ """Create image_to_video task and download resulting MP4."""
290
  try:
291
  task = runway_client.image_to_video.create(
292
  model="gen4_turbo",
 
318
  raise gr.Error("Runway returned no outputs.")
319
  video_url = outputs[0]
320
 
 
321
  clip_path = f"runway_clip_{uid()}.mp4"
322
+ with httpx.stream("GET", video_url, timeout=180) as resp:
323
  resp.raise_for_status()
324
  with open(clip_path, "wb") as f:
325
  for chunk in resp.iter_bytes():
 
330
  list_file = f"concat_{uid()}.txt"
331
  with open(list_file, "w") as lf:
332
  for p in video_paths:
333
+ lf.write(f"file '{p}'\n")
 
334
  temp_concat = f"combined_{uid()}.mp4"
335
  subprocess.run([
336
+ "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file,
337
+ "-c", "copy", temp_concat, "-y"
338
  ], check=True)
339
  subprocess.run([
340
+ "ffmpeg", "-i", temp_concat, "-i", audio_path,
341
+ "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
342
  ], check=True)
343
  for p in (list_file, temp_concat):
344
+ try: os.remove(p)
345
+ except OSError: pass
346
 
347
+ def enhance_scene_prompt(base: str, global_style: str) -> str:
348
+ return f"{base}. {global_style}"
349
 
350
+ # ---------------- Core Generation ----------------
351
  def generate_video_from_topic(
352
  topic: str,
353
+ uploaded_keyframe: Optional[str],
354
  scene_count: int,
355
  clip_duration: int,
356
  ratio: str,
357
+ voice_id: str,
358
+ model_id: str,
359
+ stability: float,
360
+ similarity: float,
361
+ style: float,
362
+ speaker_boost: bool,
363
+ use_streaming_tts: bool,
364
  progress=gr.Progress(track_tqdm=True)
365
  ) -> str:
366
  job = uid()
367
+ log.info(f"[AI-STUDIO] Start job {job} topic='{topic}'")
368
  temp_files: List[str] = []
369
  try:
370
  if not topic or not topic.strip():
 
381
  narration = script["narration"]
382
  scenes = script["scenes"]
383
 
384
+ progress(0.30, desc="🎙️ Generating narration...")
385
+ audio_path = f"narration_{job}.mp3"
386
  temp_files.append(audio_path)
387
+
388
+ tts_success = False
389
+ if ELEVEN_KEY and voice_id and model_id:
390
+ tts_success = synthesize_narration_elevenlabs(
391
+ text=narration,
392
+ voice_id=voice_id,
393
+ model_id=model_id,
394
+ stability=stability,
395
+ similarity=similarity,
396
+ style=style,
397
+ speaker_boost=speaker_boost,
398
+ streaming=use_streaming_tts,
399
+ out_path=audio_path
400
  )
401
 
402
+ if not tts_success:
403
+ log.warning("Using mock silent track (ElevenLabs unavailable or failed).")
404
+ generate_mock_voiceover(narration, audio_path)
405
+
406
+ progress(0.40, desc="🖼️ Preparing keyframe...")
407
+ if uploaded_keyframe:
408
+ prompt_image_path = uploaded_keyframe
409
  else:
410
  prompt_image_path = generate_placeholder_image(topic)
411
  temp_files.append(prompt_image_path)
412
+ with open(prompt_image_path, "rb") as f:
413
+ b64 = base64.b64encode(f.read()).decode("utf-8")
414
+ prompt_image = f"data:image/png;base64,{b64}"
415
 
416
+ global_style = "Cinematic, natural volumetric light, subtle camera motion, cohesive style, high detail"
417
  video_clips: List[str] = []
418
+
419
  for idx, base_prompt in enumerate(scenes, start=1):
420
+ progress(0.40 + 0.45 * idx / scene_count,
421
+ desc=f"🎬 Generating scene {idx}/{scene_count}...")
422
+ full_prompt = enhance_scene_prompt(base_prompt, global_style)
423
  try:
424
  clip_path = runway_generate_clip(
425
+ prompt_image=prompt_image,
426
  text_prompt=full_prompt,
427
  duration=clip_duration,
428
  ratio=ratio
429
  )
430
+ video_clips.append(clip_path)
431
+ temp_files.append(clip_path)
432
  except Exception as e:
433
+ log.error(f"Scene {idx} failed: {e}")
434
+ retry_prompt = full_prompt + " -- consistent subject, refined detail"
435
  clip_path = runway_generate_clip(
436
+ prompt_image=prompt_image,
437
  text_prompt=retry_prompt,
438
  duration=clip_duration,
439
  ratio=ratio
440
  )
441
+ video_clips.append(clip_path)
442
+ temp_files.append(clip_path)
443
 
444
  progress(0.92, desc="🧵 Stitching scenes...")
445
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
446
  concat_and_mux(video_clips, audio_path, final_out)
447
 
448
  progress(1.0, desc="✅ Done!")
449
+ log.info(f"[AI-STUDIO] Job {job} complete -> {final_out}")
450
  return final_out
451
 
452
  except Exception as e:
453
+ log.error(f"[AI-STUDIO] Job {job} FAILED: {e}", exc_info=True)
454
  raise gr.Error(f"An error occurred: {e}")
455
  finally:
456
+ # Clean temp artifacts (not final video)
457
  for p in temp_files:
458
  try:
459
  if os.path.exists(p):
 
461
  except OSError:
462
  pass
463
 
464
+ # ---------------- Helper for Voice Dropdown ----------------
465
+ def refresh_voices() -> List[str]:
466
  voices = list_elevenlabs_voices()
467
+ return [f"{v['name']}|{v['id']}" for v in voices] if voices else []
 
 
 
 
 
468
 
469
  # ---------------- Gradio UI ----------------
470
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
471
  gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
472
  gr.Markdown(
473
+ "Provide a topic (and optional keyframe). We’ll research, script, generate multi-scene video, "
474
+ "synthesize narration, and assemble the final clip."
475
  )
476
 
477
  with gr.Row():
478
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
479
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
480
 
481
  with gr.Row():
482
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
483
  duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
484
+ ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"],
485
+ value="1280:720", label="Aspect Ratio")
486
+
487
+ gr.Markdown("### Narration (ElevenLabs)")
488
+ with gr.Row():
489
+ refresh_btn = gr.Button("🔄 Refresh Voices", variant="secondary")
490
+ voices_dd = gr.Dropdown(choices=[], label="Voice (Name|ID)", value=None)
491
+ model_dd = gr.Dropdown(
492
+ choices=[
493
+ "eleven_multilingual_v2", "eleven_turbo_v2_5",
494
+ "eleven_flash_v2_5", "eleven_monolingual_v1"
495
+ ],
496
+ value="eleven_turbo_v2_5",
497
+ label="ElevenLabs Model"
498
+ )
499
+ streaming_chk = gr.Checkbox(label="Streaming TTS", value=False)
500
+
501
+ with gr.Row():
502
+ stability = gr.Slider(0, 1, value=0.55, step=0.01, label="Stability")
503
+ similarity = gr.Slider(0, 1, value=0.80, step=0.01, label="Similarity Boost")
504
+ style = gr.Slider(0, 1, value=0.20, step=0.01, label="Style")
505
+ speaker_boost = gr.Checkbox(label="Speaker Boost", value=True)
506
 
507
  generate_btn = gr.Button("🚀 Generate Video", variant="primary")
508
  output_video = gr.Video(label="Final Video")
509
 
510
+ def _do_refresh():
511
+ return gr.update(choices=refresh_voices())
512
+
513
+ refresh_btn.click(fn=_do_refresh, outputs=voices_dd)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  generate_btn.click(
516
+ fn=generate_video_from_topic,
517
+ inputs=[
518
+ topic, keyframe, scene_count, duration, ratio,
519
+ voices_dd, model_dd, stability, similarity, style,
520
+ speaker_boost, streaming_chk
521
+ ],
522
  outputs=output_video
523
  )
524
 
525
+ gr.Markdown("### Tips\n"
526
+ "- Provide a strong keyframe for better temporal coherence.\n"
527
+ "- Refine scene prompts by adjusting topic wording if motion feels generic.\n"
528
+ "- Tweak Stability and Similarity to balance expressiveness vs consistency.")
 
 
 
529
 
530
  if __name__ == "__main__":
531
  demo.launch()
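
Note on narration duration: both sides of this diff size the fallback silent track from word count (WORDS_PER_SEC = 2.5), and the removed ElevenLabs path carried the comment "could probe with ffprobe for exact." A minimal sketch of that probe, assuming ffprobe is installed alongside ffmpeg; the helper name is hypothetical and not part of this commit:

    import json
    import subprocess

    def probe_audio_duration(path: str) -> float:
        # Hypothetical follow-up: read the real duration from the file
        # instead of estimating it with the WORDS_PER_SEC heuristic.
        out = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "json", path],
            capture_output=True, text=True, check=True,
        ).stdout
        return float(json.loads(out)["format"]["duration"])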