mgbam commited on
Commit
0c28ab5
·
verified ·
1 Parent(s): 31397a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -157
app.py CHANGED
@@ -3,7 +3,6 @@ import io
3
  import json
4
  import time
5
  import random
6
- import string
7
  import logging
8
  import subprocess
9
  from pathlib import Path
@@ -12,10 +11,11 @@ from typing import List, Dict, Any, Optional
12
  import gradio as gr
13
  from PIL import Image, ImageDraw, ImageFont
14
 
15
- # --- External SDKs ---
16
- import google.generativeai as genai # Gemini (google-generativeai)
17
- from tavily import TavilyClient # Research enrichment
18
- from runwayml import RunwayML, TaskFailedError # Official Runway SDK
 
19
 
20
  # ---------------- Logging Setup ----------------
21
  logging.basicConfig(
@@ -25,86 +25,92 @@ logging.basicConfig(
25
  )
26
  log = logging.getLogger("ai_video_studio")
27
 
28
- # ---------------- Configuration ----------------
29
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
31
- # Allow either variable name for Runway:
32
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
33
-
34
- if not (GEMINI_API_KEY and TAVILY_API_KEY and RUNWAY_KEY):
35
- missing = [k for k, v in {
36
- "GEMINI_API_KEY": GEMINI_API_KEY,
37
- "TAVILY_API_KEY": TAVILY_API_KEY,
38
- "RUNWAY_API_KEY or RUNWAYML_API_SECRET": RUNWAY_KEY
39
- }.items() if not v]
 
 
40
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
41
 
 
 
 
 
42
  genai.configure(api_key=GEMINI_API_KEY)
43
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
44
  runway_client = RunwayML(api_key=RUNWAY_KEY)
 
45
 
46
  # ---------------- Constants ----------------
47
  DEFAULT_SCENES = 4
48
- WORDS_PER_SEC = 2.5 # heuristic for mock VO
49
  MAX_SCENES = 8
50
- ALLOWED_DURATIONS = {5, 10} # Gen-4 supported clip lengths
 
51
  PLACEHOLDER_BG = (18, 18, 22)
52
  PLACEHOLDER_FG = (239, 239, 245)
53
  FONT_CANDIDATES = [
54
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
55
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
56
  ]
 
 
 
57
 
58
  # ---------------- Utility Functions ----------------
59
  def uid() -> str:
60
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
61
 
62
  def sanitize_filename(name: str) -> str:
63
- safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:50]
64
  return safe or "video"
65
 
66
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
67
- """
68
- Creates a simple placeholder keyframe with the topic text.
69
- Returns path to the PNG.
70
- """
71
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
72
  draw = ImageDraw.Draw(img)
73
-
74
  font = None
75
  for path in FONT_CANDIDATES:
76
  if Path(path).exists():
77
  try:
78
- font = ImageFont.truetype(path, 42)
79
  break
80
  except Exception:
81
  pass
82
  if font is None:
83
  font = ImageFont.load_default()
84
 
85
- wrapped = []
86
  words = topic.split()
87
- line = []
88
- max_chars = 24
 
89
  for w in words:
90
- test = " ".join(line + [w])
91
  if len(test) > max_chars:
92
- wrapped.append(" ".join(line))
93
- line = [w]
94
  else:
95
- line.append(w)
96
- if line:
97
- wrapped.append(" ".join(line))
98
 
99
- total_h = sum(draw.textbbox((0, 0), ln, font=font)[3] - draw.textbbox((0, 0), ln, font=font)[1] + 10
100
- for ln in wrapped)
 
 
101
  y = (height - total_h) // 2
102
- for ln in wrapped:
103
  bbox = draw.textbbox((0, 0), ln, font=font)
104
  w = bbox[2] - bbox[0]
105
  x = (width - w) // 2
106
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
107
- y += (bbox[3] - bbox[1]) + 10
108
 
109
  out_path = f"placeholder_{uid()}.png"
110
  img.save(out_path)
@@ -117,21 +123,15 @@ def research_topic(topic: str) -> str:
117
  search_depth="basic"
118
  )
119
  if results and "results" in results:
120
- return "\n".join(
121
- str(r.get("content", "")).strip()
122
- for r in results["results"]
123
- if r.get("content")
124
  )
125
  except Exception as e:
126
  log.warning(f"Tavily failed: {e}")
127
  return "No supplemental research facts available."
128
 
129
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
130
- """
131
- Ask Gemini for structured JSON (narration + scene prompts).
132
- Includes fallback parsing if schema drifts.
133
- """
134
- # Base prompt (schema hint)
135
  prompt = f"""
136
  You are a creative director for short-form educational / promotional videos.
137
 
@@ -140,36 +140,22 @@ Topic: {topic}
140
  Supplemental Facts:
141
  {facts}
142
 
143
- Produce STRICT JSON with:
144
- "narration_script": string # a cohesive narration referencing key facts succinctly
145
- "scene_prompts": list[{scene_count}] # exactly {scene_count} cinematic, image-to-video prompts.
146
-
147
- Each scene prompt MUST:
148
- - Specify a consistent main subject (if applicable).
149
- - Include a camera or movement descriptor (e.g. "slow dolly in", "aerial shot", "handheld").
150
- - Mention lighting or mood.
151
- - Be <= 40 words, no leading numbering.
152
- JSON ONLY. No markdown fences.
153
  """
154
  model = genai.GenerativeModel("gemini-1.5-flash")
155
  response = model.generate_content(prompt)
156
-
157
  raw = (response.text or "").strip()
158
- # Fallback: remove code fences if present
159
  if raw.startswith("```"):
160
- raw = raw.strip("`")
161
- # remove potential language spec lines
162
- if raw.lower().startswith("json"):
163
- raw = raw[4:].strip()
164
-
165
- # Attempt direct parse
166
  data = None
167
  try:
168
  data = json.loads(raw)
169
  except json.JSONDecodeError:
170
- # Try to extract first {...} block heuristically
171
- start = raw.find("{")
172
- end = raw.rfind("}")
173
  if start != -1 and end != -1:
174
  try:
175
  data = json.loads(raw[start:end + 1])
@@ -177,45 +163,94 @@ JSON ONLY. No markdown fences.
177
  pass
178
  if not isinstance(data, dict):
179
  raise gr.Error("Gemini did not return valid JSON structure.")
180
-
181
  narration = data.get("narration_script")
182
  scenes = data.get("scene_prompts")
183
-
184
- # Normalize narration
185
  if isinstance(narration, list):
186
  narration = " ".join(map(str, narration))
187
  if not isinstance(narration, str) or not narration.strip():
188
  raise gr.Error("Invalid narration_script returned.")
189
  narration = narration.strip()
190
-
191
- # Normalize scenes
192
  if not isinstance(scenes, list):
193
  raise gr.Error("scene_prompts is not a list.")
194
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
195
  if len(scenes) != scene_count:
196
- # If mismatch, truncate or pad with variants
197
  while len(scenes) < scene_count:
198
- scenes.append(scenes[-1] if scenes else f"Cinematic establishing shot about {topic}")
199
  scenes = scenes[:scene_count]
200
-
201
  return {"narration": narration, "scenes": scenes}
202
 
203
- def generate_mock_voiceover(narration: str, out_path: str) -> float:
204
- """
205
- Create silent (mock) audio track sized to narration length.
206
- """
207
- duration = max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
208
  subprocess.run([
209
  "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
210
- "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame",
211
- out_path, "-y"
212
  ], check=True)
213
  return duration
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
216
- """
217
- Launch an image_to_video task and return the downloaded file path.
218
- """
219
  try:
220
  task = runway_client.image_to_video.create(
221
  model="gen4_turbo",
@@ -227,7 +262,6 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
227
  except Exception as e:
228
  raise gr.Error(f"Failed to create Runway task: {e}")
229
 
230
- # Poll until completion
231
  max_wait = 300
232
  interval = 5
233
  waited = 0
@@ -246,10 +280,8 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
246
  outputs = getattr(task, "output", None)
247
  if not outputs or not isinstance(outputs, list):
248
  raise gr.Error("Runway returned no outputs.")
249
-
250
  video_url = outputs[0]
251
 
252
- # Download
253
  import httpx
254
  clip_path = f"runway_clip_{uid()}.mp4"
255
  with httpx.stream("GET", video_url, timeout=120) as resp:
@@ -260,46 +292,39 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
260
  return clip_path
261
 
262
  def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> None:
263
- """
264
- Concatenate MP4 clips (same codec) and mux with audio.
265
- """
266
- # Create concat file
267
  list_file = f"concat_{uid()}.txt"
268
  with open(list_file, "w") as lf:
269
  for p in video_paths:
270
- lf.write(f"file '{p}'\n")
271
-
272
  temp_concat = f"combined_{uid()}.mp4"
273
  subprocess.run([
274
- "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file,
275
- "-c", "copy", temp_concat, "-y"
276
  ], check=True)
277
-
278
  subprocess.run([
279
- "ffmpeg", "-i", temp_concat, "-i", audio_path,
280
- "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
281
  ], check=True)
282
-
283
- # Cleanup intermediate concat assets
284
  for p in (list_file, temp_concat):
285
  try:
286
  os.remove(p)
287
  except OSError:
288
  pass
289
 
290
- def enhance_scene_prompt(base: str, global_style: str) -> str:
291
- """
292
- Add global style tags for coherence (camera, lighting).
293
- """
294
- return f"{base}. {global_style}"
295
 
296
- # ---------------- Main Generation Function (Gradio) ----------------
297
  def generate_video_from_topic(
298
  topic: str,
299
- uploaded_keyframe: Optional[str],
300
  scene_count: int,
301
  clip_duration: int,
302
  ratio: str,
 
 
 
 
 
303
  progress=gr.Progress(track_tqdm=True)
304
  ) -> str:
305
  job = uid()
@@ -308,10 +333,9 @@ def generate_video_from_topic(
308
  try:
309
  if not topic or not topic.strip():
310
  raise gr.Error("Please provide a topic.")
311
-
312
  scene_count = max(1, min(MAX_SCENES, scene_count))
313
  if clip_duration not in ALLOWED_DURATIONS:
314
- clip_duration = 5 # default safe value
315
 
316
  progress(0.05, desc="πŸ” Researching topic...")
317
  facts = research_topic(topic)
@@ -321,55 +345,51 @@ def generate_video_from_topic(
321
  narration = script["narration"]
322
  scenes = script["scenes"]
323
 
324
- progress(0.30, desc="πŸŽ™οΈ Creating mock voiceover...")
325
  audio_path = f"audio_{job}.mp3"
326
  temp_files.append(audio_path)
327
- generate_mock_voiceover(narration, audio_path)
 
 
 
 
 
 
 
 
 
 
328
 
329
- progress(0.40, desc="πŸ–ΌοΈ Preparing keyframe(s)...")
330
- if uploaded_keyframe:
331
- prompt_image_path = uploaded_keyframe
332
  else:
333
  prompt_image_path = generate_placeholder_image(topic)
334
  temp_files.append(prompt_image_path)
335
-
336
- # Convert image path to data URI (SDK also accepts URL; we use Data URI for local file)
337
- with open(prompt_image_path, "rb") as f:
338
- import base64
339
- b64 = base64.b64encode(f.read()).decode("utf-8")
340
- prompt_image = f"data:image/png;base64,{b64}"
341
-
342
- global_style = "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
343
 
344
  video_clips: List[str] = []
345
  for idx, base_prompt in enumerate(scenes, start=1):
346
- progress(0.40 + (0.45 * idx / scene_count),
347
- desc=f"🎬 Generating scene {idx}/{scene_count}...")
348
- full_prompt = enhance_scene_prompt(base_prompt, global_style)
349
  try:
350
  clip_path = runway_generate_clip(
351
- prompt_image=prompt_image,
352
  text_prompt=full_prompt,
353
  duration=clip_duration,
354
  ratio=ratio
355
  )
356
- video_clips.append(clip_path)
357
- temp_files.append(clip_path)
358
  except Exception as e:
359
- log.error(f"Scene {idx} failed: {e}")
360
- # Attempt one retry with a slightly modified prompt
361
- retry_prompt = full_prompt + " -- consistent subject, refined detail"
362
- try:
363
- clip_path = runway_generate_clip(
364
- prompt_image=prompt_image,
365
- text_prompt=retry_prompt,
366
- duration=clip_duration,
367
- ratio=ratio
368
- )
369
- video_clips.append(clip_path)
370
- temp_files.append(clip_path)
371
- except Exception as e2:
372
- raise gr.Error(f"Scene {idx} failed after retry: {e2}")
373
 
374
  progress(0.92, desc="🧡 Stitching scenes...")
375
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
@@ -382,9 +402,8 @@ def generate_video_from_topic(
382
  except Exception as e:
383
  log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
384
  raise gr.Error(f"An error occurred: {e}")
385
-
386
  finally:
387
- # Remove temporary artifacts but keep final video
388
  for p in temp_files:
389
  try:
390
  if os.path.exists(p):
@@ -392,38 +411,75 @@ def generate_video_from_topic(
392
  except OSError:
393
  pass
394
 
 
 
 
 
 
 
 
 
 
 
395
  # ---------------- Gradio UI ----------------
396
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
397
- gr.Markdown("# 🎬 AI Video Studio (Gen-4 Turbo)")
398
  gr.Markdown(
399
- "Enter a topic and optionally upload a keyframe image. "
400
- "The app will research, script, generate multi-scene Gen-4 Turbo clips, and stitch them."
401
  )
402
 
403
  with gr.Row():
404
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
405
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
406
 
 
 
 
 
 
 
 
407
  with gr.Row():
408
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
409
- duration = gr.Radio(choices=[5, 10], value=5, label="Seconds per Scene")
410
- ratio = gr.Dropdown(choices=[
411
- "1280:720", "1920:1080", "1080:1920", "1024:1024"
412
- ], value="1280:720", label="Aspect Ratio")
413
 
414
  generate_btn = gr.Button("πŸš€ Generate Video", variant="primary")
415
  output_video = gr.Video(label="Final Video")
416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  generate_btn.click(
418
- fn=generate_video_from_topic,
419
- inputs=[topic, keyframe, scene_count, duration, ratio],
420
  outputs=output_video
421
  )
422
 
423
- gr.Markdown("### Tips\n"
424
- "- Uploading a consistent character keyframe improves continuity.\n"
425
- "- Use specific camera verbs: *slow dolly in*, *aerial sweep*, *handheld*.\n"
426
- "- Add lighting adjectives: *golden hour*, *soft rim light*, *neon glow*.")
 
 
 
427
 
428
  if __name__ == "__main__":
429
  demo.launch()
 
3
  import json
4
  import time
5
  import random
 
6
  import logging
7
  import subprocess
8
  from pathlib import Path
 
11
  import gradio as gr
12
  from PIL import Image, ImageDraw, ImageFont
13
 
14
+ # External SDKs
15
+ import google.generativeai as genai # Gemini
16
+ from tavily import TavilyClient # Research enrichment
17
+ from runwayml import RunwayML, TaskFailedError # Runway official SDK
18
+ from elevenlabs import ElevenLabs, VoiceSettings # ElevenLabs official SDK
19
 
20
  # ---------------- Logging Setup ----------------
21
  logging.basicConfig(
 
25
  )
26
  log = logging.getLogger("ai_video_studio")
27
 
28
+ # ---------------- Configuration & Keys ----------------
29
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 
31
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
32
+ ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
33
+
34
+ REQUIRED = {
35
+ "GEMINI_API_KEY": GEMINI_API_KEY,
36
+ "TAVILY_API_KEY": TAVILY_API_KEY,
37
+ "RUNWAY_API_KEY / RUNWAYML_API_SECRET": RUNWAY_KEY,
38
+ }
39
+ missing = [k for k, v in REQUIRED.items() if not v]
40
+ if missing:
41
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
42
 
43
+ # ElevenLabs is optional; if absent we fall back to mock audio.
44
+ ELEVEN_AVAILABLE = bool(ELEVEN_KEY)
45
+
46
+ # Configure clients
47
  genai.configure(api_key=GEMINI_API_KEY)
48
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
49
  runway_client = RunwayML(api_key=RUNWAY_KEY)
50
+ eleven_client: Optional[ElevenLabs] = ElevenLabs(api_key=ELEVEN_KEY) if ELEVEN_AVAILABLE else None
51
 
52
  # ---------------- Constants ----------------
53
  DEFAULT_SCENES = 4
 
54
  MAX_SCENES = 8
55
+ WORDS_PER_SEC = 2.5 # heuristic for mock silent track
56
+ ALLOWED_DURATIONS = {5, 10}
57
  PLACEHOLDER_BG = (18, 18, 22)
58
  PLACEHOLDER_FG = (239, 239, 245)
59
  FONT_CANDIDATES = [
60
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
61
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
62
  ]
63
+ GLOBAL_STYLE = (
64
+ "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
65
+ )
66
 
67
  # ---------------- Utility Functions ----------------
68
  def uid() -> str:
69
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
70
 
71
  def sanitize_filename(name: str) -> str:
72
+ safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:64]
73
  return safe or "video"
74
 
75
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
 
 
 
 
76
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
77
  draw = ImageDraw.Draw(img)
 
78
  font = None
79
  for path in FONT_CANDIDATES:
80
  if Path(path).exists():
81
  try:
82
+ font = ImageFont.truetype(path, 44)
83
  break
84
  except Exception:
85
  pass
86
  if font is None:
87
  font = ImageFont.load_default()
88
 
 
89
  words = topic.split()
90
+ lines = []
91
+ cur = []
92
+ max_chars = 22
93
  for w in words:
94
+ test = " ".join(cur + [w])
95
  if len(test) > max_chars:
96
+ lines.append(" ".join(cur))
97
+ cur = [w]
98
  else:
99
+ cur.append(w)
100
+ if cur:
101
+ lines.append(" ".join(cur))
102
 
103
+ total_h = 0
104
+ for ln in lines:
105
+ bbox = draw.textbbox((0, 0), ln, font=font)
106
+ total_h += (bbox[3] - bbox[1]) + 8
107
  y = (height - total_h) // 2
108
+ for ln in lines:
109
  bbox = draw.textbbox((0, 0), ln, font=font)
110
  w = bbox[2] - bbox[0]
111
  x = (width - w) // 2
112
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
113
+ y += (bbox[3] - bbox[1]) + 8
114
 
115
  out_path = f"placeholder_{uid()}.png"
116
  img.save(out_path)
 
123
  search_depth="basic"
124
  )
125
  if results and "results" in results:
126
+ return "
127
+ ".join(
128
+ str(r.get("content", "")).strip() for r in results["results"] if r.get("content")
 
129
  )
130
  except Exception as e:
131
  log.warning(f"Tavily failed: {e}")
132
  return "No supplemental research facts available."
133
 
134
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
 
 
 
 
 
135
  prompt = f"""
136
  You are a creative director for short-form educational / promotional videos.
137
 
 
140
  Supplemental Facts:
141
  {facts}
142
 
143
+ Produce STRICT JSON with keys:
144
+ "narration_script": string
145
+ "scene_prompts": list[{scene_count}] of cinematic prompts (<=40 words each), no numbering.
146
+ Each scene prompt MUST specify a consistent main subject, camera/movement, and lighting/mood.
147
+ JSON ONLY, no markdown fences.
 
 
 
 
 
148
  """
149
  model = genai.GenerativeModel("gemini-1.5-flash")
150
  response = model.generate_content(prompt)
 
151
  raw = (response.text or "").strip()
 
152
  if raw.startswith("```"):
153
+ raw = raw.strip("`").lstrip("json").strip()
 
 
 
 
 
154
  data = None
155
  try:
156
  data = json.loads(raw)
157
  except json.JSONDecodeError:
158
+ start, end = raw.find("{"), raw.rfind("}")
 
 
159
  if start != -1 and end != -1:
160
  try:
161
  data = json.loads(raw[start:end + 1])
 
163
  pass
164
  if not isinstance(data, dict):
165
  raise gr.Error("Gemini did not return valid JSON structure.")
 
166
  narration = data.get("narration_script")
167
  scenes = data.get("scene_prompts")
 
 
168
  if isinstance(narration, list):
169
  narration = " ".join(map(str, narration))
170
  if not isinstance(narration, str) or not narration.strip():
171
  raise gr.Error("Invalid narration_script returned.")
172
  narration = narration.strip()
 
 
173
  if not isinstance(scenes, list):
174
  raise gr.Error("scene_prompts is not a list.")
175
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
176
  if len(scenes) != scene_count:
 
177
  while len(scenes) < scene_count:
178
+ scenes.append(scenes[-1] if scenes else f"Establishing cinematic shot about {topic}")
179
  scenes = scenes[:scene_count]
 
180
  return {"narration": narration, "scenes": scenes}
181
 
182
+ def ensure_duration(narration: str) -> float:
183
+ return max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
184
+
185
+ def mock_audio(narration: str, out_path: str) -> float:
186
+ duration = ensure_duration(narration)
187
  subprocess.run([
188
  "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
189
+ "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame", out_path, "-y"
 
190
  ], check=True)
191
  return duration
192
 
193
+ def elevenlabs_tts(narration: str, voice_id: str, out_path: str, model: str, optimize_streaming_latency: int, use_stream: bool) -> float:
194
+ if not ELEVEN_AVAILABLE:
195
+ raise gr.Error("ElevenLabs API key not configured.")
196
+ # Streaming or non-streaming generation
197
+ if use_stream:
198
+ # Streaming: write chunks as they arrive
199
+ with open(out_path, "wb") as f:
200
+ for chunk in eleven_client.text_to_speech.convert(
201
+ voice_id=voice_id,
202
+ optimize_streaming_latency=optimize_streaming_latency,
203
+ model_id=model,
204
+ output_format="mp3_44100_128",
205
+ text=narration,
206
+ voice_settings=VoiceSettings(
207
+ stability=0.5,
208
+ similarity_boost=0.8,
209
+ style=0.3,
210
+ use_speaker_boost=True,
211
+ ),
212
+ stream=True,
213
+ ):
214
+ if isinstance(chunk, bytes):
215
+ f.write(chunk)
216
+ else:
217
+ audio = eleven_client.text_to_speech.convert(
218
+ voice_id=voice_id,
219
+ model_id=model,
220
+ output_format="mp3_44100_128",
221
+ text=narration,
222
+ voice_settings=VoiceSettings(
223
+ stability=0.5,
224
+ similarity_boost=0.8,
225
+ style=0.3,
226
+ use_speaker_boost=True,
227
+ ),
228
+ )
229
+ with open(out_path, "wb") as f:
230
+ f.write(audio)
231
+ # Roughly compute duration from word count; could probe with ffprobe for exact.
232
+ return ensure_duration(narration)
233
+
234
+ def list_elevenlabs_voices() -> List[Dict[str, str]]:
235
+ if not ELEVEN_AVAILABLE:
236
+ return []
237
+ try:
238
+ voices = eleven_client.voices.get_all()
239
+ out = []
240
+ for v in voices.voices:
241
+ out.append({"id": v.voice_id, "name": v.name})
242
+ return out
243
+ except Exception as e:
244
+ log.warning(f"Failed to list voices: {e}")
245
+ return []
246
+
247
+ def build_prompt_image_data_uri(image_path: str) -> str:
248
+ import base64
249
+ with open(image_path, "rb") as f:
250
+ b64 = base64.b64encode(f.read()).decode("utf-8")
251
+ return f"data:image/png;base64,{b64}"
252
+
253
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
 
 
 
254
  try:
255
  task = runway_client.image_to_video.create(
256
  model="gen4_turbo",
 
262
  except Exception as e:
263
  raise gr.Error(f"Failed to create Runway task: {e}")
264
 
 
265
  max_wait = 300
266
  interval = 5
267
  waited = 0
 
280
  outputs = getattr(task, "output", None)
281
  if not outputs or not isinstance(outputs, list):
282
  raise gr.Error("Runway returned no outputs.")
 
283
  video_url = outputs[0]
284
 
 
285
  import httpx
286
  clip_path = f"runway_clip_{uid()}.mp4"
287
  with httpx.stream("GET", video_url, timeout=120) as resp:
 
292
  return clip_path
293
 
294
  def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> None:
 
 
 
 
295
  list_file = f"concat_{uid()}.txt"
296
  with open(list_file, "w") as lf:
297
  for p in video_paths:
298
+ lf.write(f"file '{p}'\n")
299
300
  temp_concat = f"combined_{uid()}.mp4"
301
  subprocess.run([
302
+ "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", temp_concat, "-y"
 
303
  ], check=True)
 
304
  subprocess.run([
305
+ "ffmpeg", "-i", temp_concat, "-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
 
306
  ], check=True)
 
 
307
  for p in (list_file, temp_concat):
308
  try:
309
  os.remove(p)
310
  except OSError:
311
  pass
312
 
313
+ def enhance_scene_prompt(base: str) -> str:
314
+ return f"{base}. {GLOBAL_STYLE}"
 
 
 
315
 
316
+ # ---------------- Main Generation Function ----------------
317
  def generate_video_from_topic(
318
  topic: str,
319
+ keyframe_image: Optional[str],
320
  scene_count: int,
321
  clip_duration: int,
322
  ratio: str,
323
+ use_eleven: bool,
324
+ eleven_voice: str,
325
+ eleven_model: str,
326
+ streaming: bool,
327
+ optimize_latency: int,
328
  progress=gr.Progress(track_tqdm=True)
329
  ) -> str:
330
  job = uid()
 
333
  try:
334
  if not topic or not topic.strip():
335
  raise gr.Error("Please provide a topic.")
 
336
  scene_count = max(1, min(MAX_SCENES, scene_count))
337
  if clip_duration not in ALLOWED_DURATIONS:
338
+ clip_duration = 5
339
 
340
  progress(0.05, desc="πŸ” Researching topic...")
341
  facts = research_topic(topic)
 
345
  narration = script["narration"]
346
  scenes = script["scenes"]
347
 
348
+ progress(0.30, desc="πŸŽ™οΈ Generating narration audio...")
349
  audio_path = f"audio_{job}.mp3"
350
  temp_files.append(audio_path)
351
+ if use_eleven and ELEVEN_AVAILABLE:
352
+ elevenlabs_tts(
353
+ narration=narration,
354
+ voice_id=eleven_voice,
355
+ out_path=audio_path,
356
+ model=eleven_model,
357
+ optimize_streaming_latency=optimize_latency,
358
+ use_stream=streaming,
359
+ )
360
+ else:
361
+ mock_audio(narration, audio_path)
362
 
363
+ progress(0.40, desc="πŸ–ΌοΈ Preparing keyframe image...")
364
+ if keyframe_image:
365
+ prompt_image_path = keyframe_image
366
  else:
367
  prompt_image_path = generate_placeholder_image(topic)
368
  temp_files.append(prompt_image_path)
369
+ prompt_image_data_uri = build_prompt_image_data_uri(prompt_image_path)
 
 
 
 
 
 
 
370
 
371
  video_clips: List[str] = []
372
  for idx, base_prompt in enumerate(scenes, start=1):
373
+ progress(0.40 + (0.45 * idx / scene_count), desc=f"🎬 Generating scene {idx}/{scene_count}...")
374
+ full_prompt = enhance_scene_prompt(base_prompt)
 
375
  try:
376
  clip_path = runway_generate_clip(
377
+ prompt_image=prompt_image_data_uri,
378
  text_prompt=full_prompt,
379
  duration=clip_duration,
380
  ratio=ratio
381
  )
 
 
382
  except Exception as e:
383
+ log.error(f"Scene {idx} failed: {e}; retrying once with refined prompt")
384
+ retry_prompt = full_prompt + " -- refined detail, consistent style"
385
+ clip_path = runway_generate_clip(
386
+ prompt_image=prompt_image_data_uri,
387
+ text_prompt=retry_prompt,
388
+ duration=clip_duration,
389
+ ratio=ratio
390
+ )
391
+ video_clips.append(clip_path)
392
+ temp_files.append(clip_path)
 
 
 
 
393
 
394
  progress(0.92, desc="🧡 Stitching scenes...")
395
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
 
402
  except Exception as e:
403
  log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
404
  raise gr.Error(f"An error occurred: {e}")
 
405
  finally:
406
+ # Clean up intermediate (keep keyframe if user uploaded it)
407
  for p in temp_files:
408
  try:
409
  if os.path.exists(p):
 
411
  except OSError:
412
  pass
413
 
414
+ # ---------------- Voice Helper for UI ----------------
415
+ def get_voice_choices() -> List[str]:
416
+ voices = list_elevenlabs_voices()
417
+ if not voices:
418
+ return ["eleven_monolingual_v1"] # fallback placeholder id name pattern
419
+ return [f"{v['name']}|{v['id']}" for v in voices]
420
+
421
+ VOICE_CHOICES = get_voice_choices()
422
+ DEFAULT_VOICE = VOICE_CHOICES[0] if VOICE_CHOICES else "Rachel|21m00Tcm4TlvDq8ikWAM" # Example default voice id pattern
423
+
424
  # ---------------- Gradio UI ----------------
425
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
426
+ gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
427
  gr.Markdown(
428
+ "Generate a multi-scene AI video: research β†’ script β†’ voiceover (mock or ElevenLabs) β†’ Gen-4 Turbo clips β†’ stitch."
 
429
  )
430
 
431
  with gr.Row():
432
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
433
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
434
 
435
+ with gr.Accordion("Narration Settings (ElevenLabs)", open=False):
436
+ use_eleven = gr.Checkbox(value=ELEVEN_AVAILABLE, label="Use ElevenLabs (falls back to mock if unchecked or unavailable)")
437
+ voice_select = gr.Dropdown(choices=VOICE_CHOICES, value=DEFAULT_VOICE, label="Voice (Name|ID)")
438
+ eleven_model = gr.Textbox(value="eleven_turbo_v2_5", label="ElevenLabs Model ID")
439
+ streaming = gr.Checkbox(value=True, label="Stream TTS (lower latency)")
440
+ optimize_latency = gr.Slider(0, 4, value=0, step=1, label="Optimize Streaming Latency (0=off, higher=more aggressive)")
441
+
442
  with gr.Row():
443
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
444
+ duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
445
+ ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"], value="1280:720", label="Aspect Ratio")
 
 
446
 
447
  generate_btn = gr.Button("πŸš€ Generate Video", variant="primary")
448
  output_video = gr.Video(label="Final Video")
449
 
450
+ def _parse_voice(v: str) -> str:
451
+ if "|" in v:
452
+ return v.split("|", 1)[1]
453
+ return v
454
+
455
+ def wrapper(topic, keyframe, scene_count, duration, ratio, use_eleven, voice_combo, eleven_model, streaming, optimize_latency):
456
+ voice_id = _parse_voice(voice_combo)
457
+ return generate_video_from_topic(
458
+ topic=topic,
459
+ keyframe_image=keyframe,
460
+ scene_count=scene_count,
461
+ clip_duration=int(duration),
462
+ ratio=ratio,
463
+ use_eleven=use_eleven,
464
+ eleven_voice=voice_id,
465
+ eleven_model=eleven_model.strip() or "eleven_turbo_v2_5",
466
+ streaming=streaming,
467
+ optimize_latency=int(optimize_latency),
468
+ )
469
+
470
  generate_btn.click(
471
+ fn=wrapper,
472
+ inputs=[topic, keyframe, scene_count, duration, ratio, use_eleven, voice_select, eleven_model, streaming, optimize_latency],
473
  outputs=output_video
474
  )
475
 
476
+ gr.Markdown("""---
477
+ ### Tips
478
+ - Upload a keyframe to increase subject continuity.
479
+ - Refine prompts by editing the generated scene prompts logic (extend code for manual review step).
480
+ - ElevenLabs: if you get 401 errors, verify the API key and voice ID. For new voices, refresh the Space (reload) to repopulate the list.
481
+ - Use 5s scenes for faster iteration; switch to 10s for final renders.
482
+ """)
483
 
484
  if __name__ == "__main__":
485
  demo.launch()