Spaces:

mgbam
/

my-video-app

Sleeping

App Files Files Community

mgbam committed 30 days ago

Commit

0c28ab5

verified ·

1 Parent(s): 31397a9

Update app.py

Browse files

Files changed (1) hide show

app.py +213 -157

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import io
 import json
 import time
 import random
-import string
 import logging
 import subprocess
 from pathlib import Path
@@ -12,10 +11,11 @@ from typing import List, Dict, Any, Optional
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
-# --- External SDKs ---
-import google.generativeai as genai          # Gemini (google-generativeai)
-from tavily import TavilyClient              # Research enrichment
-from runwayml import RunwayML, TaskFailedError  # Official Runway SDK
 # ---------------- Logging Setup ----------------
 logging.basicConfig(
@@ -25,86 +25,92 @@ logging.basicConfig(
 )
 log = logging.getLogger("ai_video_studio")
-# ---------------- Configuration ----------------
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
-# Allow either variable name for Runway:
 RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
-if not (GEMINI_API_KEY and TAVILY_API_KEY and RUNWAY_KEY):
-    missing = [k for k, v in {
-        "GEMINI_API_KEY": GEMINI_API_KEY,
-        "TAVILY_API_KEY": TAVILY_API_KEY,
-        "RUNWAY_API_KEY or RUNWAYML_API_SECRET": RUNWAY_KEY
-    }.items() if not v]
     raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
 genai.configure(api_key=GEMINI_API_KEY)
 tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
 runway_client = RunwayML(api_key=RUNWAY_KEY)
 # ---------------- Constants ----------------
 DEFAULT_SCENES = 4
-WORDS_PER_SEC = 2.5  # heuristic for mock VO
 MAX_SCENES = 8
-ALLOWED_DURATIONS = {5, 10}  # Gen-4 supported clip lengths
 PLACEHOLDER_BG = (18, 18, 22)
 PLACEHOLDER_FG = (239, 239, 245)
 FONT_CANDIDATES = [
     "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
     "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
 ]
 # ---------------- Utility Functions ----------------
 def uid() -> str:
     return f"{int(time.time())}_{random.randint(1000, 9999)}"
 def sanitize_filename(name: str) -> str:
-    safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:50]
     return safe or "video"
 def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
-    """
-    Creates a simple placeholder keyframe with the topic text.
-    Returns path to the PNG.
-    """
     img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
     draw = ImageDraw.Draw(img)
     font = None
     for path in FONT_CANDIDATES:
         if Path(path).exists():
             try:
-                font = ImageFont.truetype(path, 42)
                 break
             except Exception:
                 pass
     if font is None:
         font = ImageFont.load_default()
-    wrapped = []
     words = topic.split()
-    line = []
-    max_chars = 24
     for w in words:
-        test = " ".join(line + [w])
         if len(test) > max_chars:
-            wrapped.append(" ".join(line))
-            line = [w]
         else:
-            line.append(w)
-    if line:
-        wrapped.append(" ".join(line))
-    total_h = sum(draw.textbbox((0, 0), ln, font=font)[3] - draw.textbbox((0, 0), ln, font=font)[1] + 10
-                  for ln in wrapped)
     y = (height - total_h) // 2
-    for ln in wrapped:
         bbox = draw.textbbox((0, 0), ln, font=font)
         w = bbox[2] - bbox[0]
         x = (width - w) // 2
         draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
-        y += (bbox[3] - bbox[1]) + 10
     out_path = f"placeholder_{uid()}.png"
     img.save(out_path)
@@ -117,21 +123,15 @@ def research_topic(topic: str) -> str:
             search_depth="basic"
         )
         if results and "results" in results:
-            return "\n".join(
-                str(r.get("content", "")).strip()
-                for r in results["results"]
-                if r.get("content")
             )
     except Exception as e:
         log.warning(f"Tavily failed: {e}")
     return "No supplemental research facts available."
 def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
-    """
-    Ask Gemini for structured JSON (narration + scene prompts).
-    Includes fallback parsing if schema drifts.
-    """
-    # Base prompt (schema hint)
     prompt = f"""
 You are a creative director for short-form educational / promotional videos.
@@ -140,36 +140,22 @@ Topic: {topic}
 Supplemental Facts:
 {facts}
-Produce STRICT JSON with:
-  "narration_script": string   # a cohesive narration referencing key facts succinctly
-  "scene_prompts": list[{scene_count}]  # exactly {scene_count} cinematic, image-to-video prompts.
-Each scene prompt MUST:
- - Specify a consistent main subject (if applicable).
- - Include a camera or movement descriptor (e.g. "slow dolly in", "aerial shot", "handheld").
- - Mention lighting or mood.
- - Be <= 40 words, no leading numbering.
-JSON ONLY. No markdown fences.
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
     raw = (response.text or "").strip()
-    # Fallback: remove code fences if present
     if raw.startswith("```"):
-        raw = raw.strip("`")
-        # remove potential language spec lines
-        if raw.lower().startswith("json"):
-            raw = raw[4:].strip()
-    # Attempt direct parse
     data = None
     try:
         data = json.loads(raw)
     except json.JSONDecodeError:
-        # Try to extract first {...} block heuristically
-        start = raw.find("{")
-        end = raw.rfind("}")
         if start != -1 and end != -1:
             try:
                 data = json.loads(raw[start:end + 1])
@@ -177,45 +163,94 @@ JSON ONLY. No markdown fences.
                 pass
     if not isinstance(data, dict):
         raise gr.Error("Gemini did not return valid JSON structure.")
     narration = data.get("narration_script")
     scenes = data.get("scene_prompts")
-    # Normalize narration
     if isinstance(narration, list):
         narration = " ".join(map(str, narration))
     if not isinstance(narration, str) or not narration.strip():
         raise gr.Error("Invalid narration_script returned.")
     narration = narration.strip()
-    # Normalize scenes
     if not isinstance(scenes, list):
         raise gr.Error("scene_prompts is not a list.")
     scenes = [str(s).strip() for s in scenes if str(s).strip()]
     if len(scenes) != scene_count:
-        # If mismatch, truncate or pad with variants
         while len(scenes) < scene_count:
-            scenes.append(scenes[-1] if scenes else f"Cinematic establishing shot about {topic}")
         scenes = scenes[:scene_count]
     return {"narration": narration, "scenes": scenes}
-def generate_mock_voiceover(narration: str, out_path: str) -> float:
-    """
-    Create silent (mock) audio track sized to narration length.
-    """
-    duration = max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
     subprocess.run([
         "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
-        "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame",
-        out_path, "-y"
     ], check=True)
     return duration
 def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
-    """
-    Launch an image_to_video task and return the downloaded file path.
-    """
     try:
         task = runway_client.image_to_video.create(
             model="gen4_turbo",
@@ -227,7 +262,6 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
     except Exception as e:
         raise gr.Error(f"Failed to create Runway task: {e}")
-    # Poll until completion
     max_wait = 300
     interval = 5
     waited = 0
@@ -246,10 +280,8 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
     outputs = getattr(task, "output", None)
     if not outputs or not isinstance(outputs, list):
         raise gr.Error("Runway returned no outputs.")
     video_url = outputs[0]
-    # Download
     import httpx
     clip_path = f"runway_clip_{uid()}.mp4"
     with httpx.stream("GET", video_url, timeout=120) as resp:
@@ -260,46 +292,39 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
     return clip_path
 def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> None:
-    """
-    Concatenate MP4 clips (same codec) and mux with audio.
-    """
-    # Create concat file
     list_file = f"concat_{uid()}.txt"
     with open(list_file, "w") as lf:
         for p in video_paths:
-            lf.write(f"file '{p}'\n")
     temp_concat = f"combined_{uid()}.mp4"
     subprocess.run([
-        "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file,
-        "-c", "copy", temp_concat, "-y"
     ], check=True)
     subprocess.run([
-        "ffmpeg", "-i", temp_concat, "-i", audio_path,
-        "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
     ], check=True)
-    # Cleanup intermediate concat assets
     for p in (list_file, temp_concat):
         try:
             os.remove(p)
         except OSError:
             pass
-def enhance_scene_prompt(base: str, global_style: str) -> str:
-    """
-    Add global style tags for coherence (camera, lighting).
-    """
-    return f"{base}. {global_style}"
-# ---------------- Main Generation Function (Gradio) ----------------
 def generate_video_from_topic(
     topic: str,
-    uploaded_keyframe: Optional[str],
     scene_count: int,
     clip_duration: int,
     ratio: str,
     progress=gr.Progress(track_tqdm=True)
 ) -> str:
     job = uid()
@@ -308,10 +333,9 @@ def generate_video_from_topic(
     try:
         if not topic or not topic.strip():
             raise gr.Error("Please provide a topic.")
         scene_count = max(1, min(MAX_SCENES, scene_count))
         if clip_duration not in ALLOWED_DURATIONS:
-            clip_duration = 5  # default safe value
         progress(0.05, desc="🔍 Researching topic...")
         facts = research_topic(topic)
@@ -321,55 +345,51 @@ def generate_video_from_topic(
         narration = script["narration"]
         scenes = script["scenes"]
-        progress(0.30, desc="🎙️ Creating mock voiceover...")
         audio_path = f"audio_{job}.mp3"
         temp_files.append(audio_path)
-        generate_mock_voiceover(narration, audio_path)
-        progress(0.40, desc="🖼️ Preparing keyframe(s)...")
-        if uploaded_keyframe:
-            prompt_image_path = uploaded_keyframe
         else:
             prompt_image_path = generate_placeholder_image(topic)
             temp_files.append(prompt_image_path)
-        # Convert image path to data URI (SDK also accepts URL; we use Data URI for local file)
-        with open(prompt_image_path, "rb") as f:
-            import base64
-            b64 = base64.b64encode(f.read()).decode("utf-8")
-            prompt_image = f"data:image/png;base64,{b64}"
-        global_style = "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
         video_clips: List[str] = []
         for idx, base_prompt in enumerate(scenes, start=1):
-            progress(0.40 + (0.45 * idx / scene_count),
-                     desc=f"🎬 Generating scene {idx}/{scene_count}...")
-            full_prompt = enhance_scene_prompt(base_prompt, global_style)
             try:
                 clip_path = runway_generate_clip(
-                    prompt_image=prompt_image,
                     text_prompt=full_prompt,
                     duration=clip_duration,
                     ratio=ratio
                 )
-                video_clips.append(clip_path)
-                temp_files.append(clip_path)
             except Exception as e:
-                log.error(f"Scene {idx} failed: {e}")
-                # Attempt one retry with a slightly modified prompt
-                retry_prompt = full_prompt + " -- consistent subject, refined detail"
-                try:
-                    clip_path = runway_generate_clip(
-                        prompt_image=prompt_image,
-                        text_prompt=retry_prompt,
-                        duration=clip_duration,
-                        ratio=ratio
-                    )
-                    video_clips.append(clip_path)
-                    temp_files.append(clip_path)
-                except Exception as e2:
-                    raise gr.Error(f"Scene {idx} failed after retry: {e2}")
         progress(0.92, desc="🧵 Stitching scenes...")
         final_out = f"{sanitize_filename(topic)}_{job}.mp4"
@@ -382,9 +402,8 @@ def generate_video_from_topic(
     except Exception as e:
         log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
         raise gr.Error(f"An error occurred: {e}")
     finally:
-        # Remove temporary artifacts but keep final video
         for p in temp_files:
             try:
                 if os.path.exists(p):
@@ -392,38 +411,75 @@ def generate_video_from_topic(
             except OSError:
                 pass
 # ---------------- Gradio UI ----------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🎬 AI Video Studio (Gen-4 Turbo)")
     gr.Markdown(
-        "Enter a topic and optionally upload a keyframe image. "
-        "The app will research, script, generate multi-scene Gen-4 Turbo clips, and stitch them."
     )
     with gr.Row():
         topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
         keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
     with gr.Row():
         scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
-        duration = gr.Radio(choices=[5, 10], value=5, label="Seconds per Scene")
-        ratio = gr.Dropdown(choices=[
-            "1280:720", "1920:1080", "1080:1920", "1024:1024"
-        ], value="1280:720", label="Aspect Ratio")
     generate_btn = gr.Button("🚀 Generate Video", variant="primary")
     output_video = gr.Video(label="Final Video")
     generate_btn.click(
-        fn=generate_video_from_topic,
-        inputs=[topic, keyframe, scene_count, duration, ratio],
         outputs=output_video
     )
-    gr.Markdown("### Tips\n"
-                "- Uploading a consistent character keyframe improves continuity.\n"
-                "- Use specific camera verbs: *slow dolly in*, *aerial sweep*, *handheld*.\n"
-                "- Add lighting adjectives: *golden hour*, *soft rim light*, *neon glow*.")
 if __name__ == "__main__":
     demo.launch()

 import json
 import time
 import random
 import logging
 import subprocess
 from pathlib import Path
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+# External SDKs
+import google.generativeai as genai              # Gemini
+from tavily import TavilyClient                  # Research enrichment
+from runwayml import RunwayML, TaskFailedError   # Runway official SDK
+from elevenlabs import ElevenLabs, VoiceSettings # ElevenLabs official SDK
 # ---------------- Logging Setup ----------------
 logging.basicConfig(
 )
 log = logging.getLogger("ai_video_studio")
+# ---------------- Configuration & Keys ----------------
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
+ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
+REQUIRED = {
+    "GEMINI_API_KEY": GEMINI_API_KEY,
+    "TAVILY_API_KEY": TAVILY_API_KEY,
+    "RUNWAY_API_KEY / RUNWAYML_API_SECRET": RUNWAY_KEY,
+}
+missing = [k for k, v in REQUIRED.items() if not v]
+if missing:
     raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
+# ElevenLabs is optional; if absent we fall back to mock audio.
+ELEVEN_AVAILABLE = bool(ELEVEN_KEY)
+# Configure clients
 genai.configure(api_key=GEMINI_API_KEY)
 tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
 runway_client = RunwayML(api_key=RUNWAY_KEY)
+eleven_client: Optional[ElevenLabs] = ElevenLabs(api_key=ELEVEN_KEY) if ELEVEN_AVAILABLE else None
 # ---------------- Constants ----------------
 DEFAULT_SCENES = 4
 MAX_SCENES = 8
+WORDS_PER_SEC = 2.5  # heuristic for mock silent track
+ALLOWED_DURATIONS = {5, 10}
 PLACEHOLDER_BG = (18, 18, 22)
 PLACEHOLDER_FG = (239, 239, 245)
 FONT_CANDIDATES = [
     "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
     "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
 ]
+GLOBAL_STYLE = (
+    "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
+)
 # ---------------- Utility Functions ----------------
 def uid() -> str:
     return f"{int(time.time())}_{random.randint(1000, 9999)}"
 def sanitize_filename(name: str) -> str:
+    safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:64]
     return safe or "video"
 def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
     img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
     draw = ImageDraw.Draw(img)
     font = None
     for path in FONT_CANDIDATES:
         if Path(path).exists():
             try:
+                font = ImageFont.truetype(path, 44)
                 break
             except Exception:
                 pass
     if font is None:
         font = ImageFont.load_default()
     words = topic.split()
+    lines = []
+    cur = []
+    max_chars = 22
     for w in words:
+        test = " ".join(cur + [w])
         if len(test) > max_chars:
+            lines.append(" ".join(cur))
+            cur = [w]
         else:
+            cur.append(w)
+    if cur:
+        lines.append(" ".join(cur))
+    total_h = 0
+    for ln in lines:
+        bbox = draw.textbbox((0, 0), ln, font=font)
+        total_h += (bbox[3] - bbox[1]) + 8
     y = (height - total_h) // 2
+    for ln in lines:
         bbox = draw.textbbox((0, 0), ln, font=font)
         w = bbox[2] - bbox[0]
         x = (width - w) // 2
         draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
+        y += (bbox[3] - bbox[1]) + 8
     out_path = f"placeholder_{uid()}.png"
     img.save(out_path)
             search_depth="basic"
         )
         if results and "results" in results:
+            return "\n".join(
+                str(r.get("content", "")).strip() for r in results["results"] if r.get("content")
             )
     except Exception as e:
         log.warning(f"Tavily failed: {e}")
     return "No supplemental research facts available."
 def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
     prompt = f"""
 You are a creative director for short-form educational / promotional videos.
 Supplemental Facts:
 {facts}
+Produce STRICT JSON with keys:
+  "narration_script": string
+  "scene_prompts": list[{scene_count}] of cinematic prompts (<=40 words each), no numbering.
+Each scene prompt MUST specify a consistent main subject, camera/movement, and lighting/mood.
+JSON ONLY, no markdown fences.
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
     raw = (response.text or "").strip()
     if raw.startswith("```"):
+        raw = raw.strip("`").lstrip("json").strip()
     data = None
     try:
         data = json.loads(raw)
     except json.JSONDecodeError:
+        start, end = raw.find("{"), raw.rfind("}")
         if start != -1 and end != -1:
             try:
                 data = json.loads(raw[start:end + 1])
                 pass
     if not isinstance(data, dict):
         raise gr.Error("Gemini did not return valid JSON structure.")
     narration = data.get("narration_script")
     scenes = data.get("scene_prompts")
     if isinstance(narration, list):
         narration = " ".join(map(str, narration))
     if not isinstance(narration, str) or not narration.strip():
         raise gr.Error("Invalid narration_script returned.")
     narration = narration.strip()
     if not isinstance(scenes, list):
         raise gr.Error("scene_prompts is not a list.")
     scenes = [str(s).strip() for s in scenes if str(s).strip()]
     if len(scenes) != scene_count:
         while len(scenes) < scene_count:
+            scenes.append(scenes[-1] if scenes else f"Establishing cinematic shot about {topic}")
         scenes = scenes[:scene_count]
     return {"narration": narration, "scenes": scenes}
+def ensure_duration(narration: str) -> float:
+    return max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
+def mock_audio(narration: str, out_path: str) -> float:
+    duration = ensure_duration(narration)
     subprocess.run([
         "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
+        "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame", out_path, "-y"
     ], check=True)
     return duration
+def elevenlabs_tts(narration: str, voice_id: str, out_path: str, model: str, optimize_streaming_latency: int, use_stream: bool) -> float:
+    if not ELEVEN_AVAILABLE:
+        raise gr.Error("ElevenLabs API key not configured.")
+    # Streaming or non-streaming generation
+    if use_stream:
+        # Streaming: write chunks as they arrive
+        with open(out_path, "wb") as f:
+            for chunk in eleven_client.text_to_speech.convert(
+                voice_id=voice_id,
+                optimize_streaming_latency=optimize_streaming_latency,
+                model_id=model,
+                output_format="mp3_44100_128",
+                text=narration,
+                voice_settings=VoiceSettings(
+                    stability=0.5,
+                    similarity_boost=0.8,
+                    style=0.3,
+                    use_speaker_boost=True,
+                ),
+                stream=True,
+            ):
+                if isinstance(chunk, bytes):
+                    f.write(chunk)
+    else:
+        audio = eleven_client.text_to_speech.convert(
+            voice_id=voice_id,
+            model_id=model,
+            output_format="mp3_44100_128",
+            text=narration,
+            voice_settings=VoiceSettings(
+                stability=0.5,
+                similarity_boost=0.8,
+                style=0.3,
+                use_speaker_boost=True,
+            ),
+        )
+        with open(out_path, "wb") as f:
+            f.write(audio)
+    # Roughly compute duration from word count; could probe with ffprobe for exact.
+    return ensure_duration(narration)
+def list_elevenlabs_voices() -> List[Dict[str, str]]:
+    if not ELEVEN_AVAILABLE:
+        return []
+    try:
+        voices = eleven_client.voices.get_all()
+        out = []
+        for v in voices.voices:
+            out.append({"id": v.voice_id, "name": v.name})
+        return out
+    except Exception as e:
+        log.warning(f"Failed to list voices: {e}")
+        return []
+def build_prompt_image_data_uri(image_path: str) -> str:
+    import base64
+    with open(image_path, "rb") as f:
+        b64 = base64.b64encode(f.read()).decode("utf-8")
+    return f"data:image/png;base64,{b64}"
 def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
     try:
         task = runway_client.image_to_video.create(
             model="gen4_turbo",
     except Exception as e:
         raise gr.Error(f"Failed to create Runway task: {e}")
     max_wait = 300
     interval = 5
     waited = 0
     outputs = getattr(task, "output", None)
     if not outputs or not isinstance(outputs, list):
         raise gr.Error("Runway returned no outputs.")
     video_url = outputs[0]
     import httpx
     clip_path = f"runway_clip_{uid()}.mp4"
     with httpx.stream("GET", video_url, timeout=120) as resp:
     return clip_path
 def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> None:
     list_file = f"concat_{uid()}.txt"
     with open(list_file, "w") as lf:
         for p in video_paths:
+            lf.write(f"file '{p}'\n")
     temp_concat = f"combined_{uid()}.mp4"
     subprocess.run([
+        "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", temp_concat, "-y"
     ], check=True)
     subprocess.run([
+        "ffmpeg", "-i", temp_concat, "-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
     ], check=True)
     for p in (list_file, temp_concat):
         try:
             os.remove(p)
         except OSError:
             pass
+def enhance_scene_prompt(base: str) -> str:
+    return f"{base}. {GLOBAL_STYLE}"
+# ---------------- Main Generation Function ----------------
 def generate_video_from_topic(
     topic: str,
+    keyframe_image: Optional[str],
     scene_count: int,
     clip_duration: int,
     ratio: str,
+    use_eleven: bool,
+    eleven_voice: str,
+    eleven_model: str,
+    streaming: bool,
+    optimize_latency: int,
     progress=gr.Progress(track_tqdm=True)
 ) -> str:
     job = uid()
     try:
         if not topic or not topic.strip():
             raise gr.Error("Please provide a topic.")
         scene_count = max(1, min(MAX_SCENES, scene_count))
         if clip_duration not in ALLOWED_DURATIONS:
+            clip_duration = 5
         progress(0.05, desc="🔍 Researching topic...")
         facts = research_topic(topic)
         narration = script["narration"]
         scenes = script["scenes"]
+        progress(0.30, desc="🎙️ Generating narration audio...")
         audio_path = f"audio_{job}.mp3"
         temp_files.append(audio_path)
+        if use_eleven and ELEVEN_AVAILABLE:
+            elevenlabs_tts(
+                narration=narration,
+                voice_id=eleven_voice,
+                out_path=audio_path,
+                model=eleven_model,
+                optimize_streaming_latency=optimize_latency,
+                use_stream=streaming,
+            )
+        else:
+            mock_audio(narration, audio_path)
+        progress(0.40, desc="🖼️ Preparing keyframe image...")
+        if keyframe_image:
+            prompt_image_path = keyframe_image
         else:
             prompt_image_path = generate_placeholder_image(topic)
             temp_files.append(prompt_image_path)
+        prompt_image_data_uri = build_prompt_image_data_uri(prompt_image_path)
         video_clips: List[str] = []
         for idx, base_prompt in enumerate(scenes, start=1):
+            progress(0.40 + (0.45 * idx / scene_count), desc=f"🎬 Generating scene {idx}/{scene_count}...")
+            full_prompt = enhance_scene_prompt(base_prompt)
             try:
                 clip_path = runway_generate_clip(
+                    prompt_image=prompt_image_data_uri,
                     text_prompt=full_prompt,
                     duration=clip_duration,
                     ratio=ratio
                 )
             except Exception as e:
+                log.error(f"Scene {idx} failed: {e}; retrying once with refined prompt")
+                retry_prompt = full_prompt + " -- refined detail, consistent style"
+                clip_path = runway_generate_clip(
+                    prompt_image=prompt_image_data_uri,
+                    text_prompt=retry_prompt,
+                    duration=clip_duration,
+                    ratio=ratio
+                )
+            video_clips.append(clip_path)
+            temp_files.append(clip_path)
         progress(0.92, desc="🧵 Stitching scenes...")
         final_out = f"{sanitize_filename(topic)}_{job}.mp4"
     except Exception as e:
         log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
         raise gr.Error(f"An error occurred: {e}")
     finally:
+        # Clean up intermediate (keep keyframe if user uploaded it)
         for p in temp_files:
             try:
                 if os.path.exists(p):
             except OSError:
                 pass
+# ---------------- Voice Helper for UI ----------------
+def get_voice_choices() -> List[str]:
+    voices = list_elevenlabs_voices()
+    if not voices:
+        return ["eleven_monolingual_v1"]  # fallback placeholder id name pattern
+    return [f"{v['name']}|{v['id']}" for v in voices]
+VOICE_CHOICES = get_voice_choices()
+DEFAULT_VOICE = VOICE_CHOICES[0] if VOICE_CHOICES else "Rachel|21m00Tcm4TlvDq8ikWAM"  # Example default voice id pattern
 # ---------------- Gradio UI ----------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
     gr.Markdown(
+        "Generate a multi-scene AI video: research → script → voiceover (mock or ElevenLabs) → Gen-4 Turbo clips → stitch."
     )
     with gr.Row():
         topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
         keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
+    with gr.Accordion("Narration Settings (ElevenLabs)", open=False):
+        use_eleven = gr.Checkbox(value=ELEVEN_AVAILABLE, label="Use ElevenLabs (falls back to mock if unchecked or unavailable)")
+        voice_select = gr.Dropdown(choices=VOICE_CHOICES, value=DEFAULT_VOICE, label="Voice (Name|ID)")
+        eleven_model = gr.Textbox(value="eleven_turbo_v2_5", label="ElevenLabs Model ID")
+        streaming = gr.Checkbox(value=True, label="Stream TTS (lower latency)")
+        optimize_latency = gr.Slider(0, 4, value=0, step=1, label="Optimize Streaming Latency (0=off, higher=more aggressive)")
     with gr.Row():
         scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
+        duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
+        ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"], value="1280:720", label="Aspect Ratio")
     generate_btn = gr.Button("🚀 Generate Video", variant="primary")
     output_video = gr.Video(label="Final Video")
+    def _parse_voice(v: str) -> str:
+        if "|" in v:
+            return v.split("|", 1)[1]
+        return v
+    def wrapper(topic, keyframe, scene_count, duration, ratio, use_eleven, voice_combo, eleven_model, streaming, optimize_latency):
+        voice_id = _parse_voice(voice_combo)
+        return generate_video_from_topic(
+            topic=topic,
+            keyframe_image=keyframe,
+            scene_count=scene_count,
+            clip_duration=int(duration),
+            ratio=ratio,
+            use_eleven=use_eleven,
+            eleven_voice=voice_id,
+            eleven_model=eleven_model.strip() or "eleven_turbo_v2_5",
+            streaming=streaming,
+            optimize_latency=int(optimize_latency),
+        )
     generate_btn.click(
+        fn=wrapper,
+        inputs=[topic, keyframe, scene_count, duration, ratio, use_eleven, voice_select, eleven_model, streaming, optimize_latency],
         outputs=output_video
     )
+    gr.Markdown("""---
+### Tips
+- Upload a keyframe to increase subject continuity.
+- Refine prompts by editing the generated scene prompts logic (extend code for manual review step).
+- ElevenLabs: if you get 401 errors, verify the API key and voice ID. For new voices, refresh the Space (reload) to repopulate the list.
+- Use 5s scenes for faster iteration; switch to 10s for final renders.
+""")
 if __name__ == "__main__":
     demo.launch()