mgbam committed
Commit 9c2d4ce · verified · 1 Parent(s): 0c28ab5

Update app.py

Files changed (1):
  1. app.py +253 -207
app.py CHANGED
@@ -12,12 +12,14 @@ import gradio as gr
12
  from PIL import Image, ImageDraw, ImageFont
13
 
14
  # External SDKs
15
- import google.generativeai as genai # Gemini
16
- from tavily import TavilyClient # Research enrichment
17
- from runwayml import RunwayML, TaskFailedError # Runway official SDK
18
- from elevenlabs import ElevenLabs, VoiceSettings # ElevenLabs official SDK
19
-
20
- # ---------------- Logging Setup ----------------
21
  logging.basicConfig(
22
  level=logging.INFO,
23
  format="[%(levelname)s %(asctime)s] %(message)s",
@@ -25,113 +27,113 @@ logging.basicConfig(
25
  )
26
  log = logging.getLogger("ai_video_studio")
27
 
28
- # ---------------- Configuration & Keys ----------------
29
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
31
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
32
- ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
33
 
34
- REQUIRED = {
35
  "GEMINI_API_KEY": GEMINI_API_KEY,
36
  "TAVILY_API_KEY": TAVILY_API_KEY,
37
- "RUNWAY_API_KEY / RUNWAYML_API_SECRET": RUNWAY_KEY,
38
- }
39
- missing = [k for k, v in REQUIRED.items() if not v]
40
  if missing:
41
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
42
 
43
- # ElevenLabs is optional; if absent we fall back to mock audio.
44
- ELEVEN_AVAILABLE = bool(ELEVEN_KEY)
45
-
46
- # Configure clients
47
  genai.configure(api_key=GEMINI_API_KEY)
48
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
49
  runway_client = RunwayML(api_key=RUNWAY_KEY)
50
- eleven_client: Optional[ElevenLabs] = ElevenLabs(api_key=ELEVEN_KEY) if ELEVEN_AVAILABLE else None
51
 
52
  # ---------------- Constants ----------------
53
  DEFAULT_SCENES = 4
54
  MAX_SCENES = 8
55
- WORDS_PER_SEC = 2.5 # heuristic for mock silent track
56
- ALLOWED_DURATIONS = {5, 10}
57
  PLACEHOLDER_BG = (18, 18, 22)
58
  PLACEHOLDER_FG = (239, 239, 245)
59
  FONT_CANDIDATES = [
60
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
61
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
62
  ]
63
- GLOBAL_STYLE = (
64
- "Cinematic, natural volumetric light, subtle camera motion, high coherence, 4k texture detail"
65
- )
66
 
67
- # ---------------- Utility Functions ----------------
68
  def uid() -> str:
69
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
70
 
71
  def sanitize_filename(name: str) -> str:
72
- safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:64]
73
  return safe or "video"
74
 
75
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
 
76
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
77
  draw = ImageDraw.Draw(img)
78
  font = None
79
  for path in FONT_CANDIDATES:
80
  if Path(path).exists():
81
  try:
82
- font = ImageFont.truetype(path, 44)
83
  break
84
  except Exception:
85
  pass
86
  if font is None:
87
  font = ImageFont.load_default()
88
 
89
- words = topic.split()
90
- lines = []
91
- cur = []
92
- max_chars = 22
93
- for w in words:
94
- test = " ".join(cur + [w])
95
  if len(test) > max_chars:
96
- lines.append(" ".join(cur))
97
- cur = [w]
98
  else:
99
- cur.append(w)
100
- if cur:
101
- lines.append(" ".join(cur))
102
 
103
  total_h = 0
104
- for ln in lines:
 
105
  bbox = draw.textbbox((0, 0), ln, font=font)
106
- total_h += (bbox[3] - bbox[1]) + 8
107
  y = (height - total_h) // 2
108
- for ln in lines:
109
  bbox = draw.textbbox((0, 0), ln, font=font)
110
  w = bbox[2] - bbox[0]
111
  x = (width - w) // 2
112
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
113
- y += (bbox[3] - bbox[1]) + 8
114
 
115
  out_path = f"placeholder_{uid()}.png"
116
  img.save(out_path)
117
  return out_path
118
 
119
  def research_topic(topic: str) -> str:
 
120
  try:
121
  results = tavily_client.search(
122
  query=f"Key facts and interesting points about {topic}",
123
  search_depth="basic"
124
  )
125
  if results and "results" in results:
126
- return "
127
- ".join(
128
- str(r.get("content", "")).strip() for r in results["results"] if r.get("content")
 
129
  )
130
  except Exception as e:
131
  log.warning(f"Tavily failed: {e}")
132
  return "No supplemental research facts available."
133
 
134
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
 
135
  prompt = f"""
136
  You are a creative director for short-form educational / promotional videos.
137
 
@@ -140,117 +142,151 @@ Topic: {topic}
140
  Supplemental Facts:
141
  {facts}
142
 
143
- Produce STRICT JSON with keys:
144
- "narration_script": string
145
- "scene_prompts": list[{scene_count}] of cinematic prompts (<=40 words each), no numbering.
146
- Each scene prompt MUST specify a consistent main subject, camera/movement, and lighting/mood.
147
- JSON ONLY, no markdown fences.
148
  """
149
  model = genai.GenerativeModel("gemini-1.5-flash")
150
  response = model.generate_content(prompt)
151
  raw = (response.text or "").strip()
 
152
  if raw.startswith("```"):
153
- raw = raw.strip("`").lstrip("json").strip()
154
  data = None
155
  try:
156
  data = json.loads(raw)
157
  except json.JSONDecodeError:
158
- start, end = raw.find("{"), raw.rfind("}")
 
159
  if start != -1 and end != -1:
160
  try:
161
  data = json.loads(raw[start:end + 1])
162
  except Exception:
163
  pass
164
  if not isinstance(data, dict):
165
- raise gr.Error("Gemini did not return valid JSON structure.")
 
166
  narration = data.get("narration_script")
167
  scenes = data.get("scene_prompts")
 
168
  if isinstance(narration, list):
169
  narration = " ".join(map(str, narration))
170
  if not isinstance(narration, str) or not narration.strip():
171
  raise gr.Error("Invalid narration_script returned.")
172
  narration = narration.strip()
 
173
  if not isinstance(scenes, list):
174
- raise gr.Error("scene_prompts is not a list.")
175
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
176
  if len(scenes) != scene_count:
 
177
  while len(scenes) < scene_count:
178
- scenes.append(scenes[-1] if scenes else f"Establishing cinematic shot about {topic}")
179
  scenes = scenes[:scene_count]
180
- return {"narration": narration, "scenes": scenes}
181
-
182
- def ensure_duration(narration: str) -> float:
183
- return max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
184
-
185
- def mock_audio(narration: str, out_path: str) -> float:
186
- duration = ensure_duration(narration)
187
- subprocess.run([
188
- "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
189
- "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame", out_path, "-y"
190
- ], check=True)
191
- return duration
192
 
193
- def elevenlabs_tts(narration: str, voice_id: str, out_path: str, model: str, optimize_streaming_latency: int, use_stream: bool) -> float:
194
- if not ELEVEN_AVAILABLE:
195
- raise gr.Error("ElevenLabs API key not configured.")
196
- # Streaming or non-streaming generation
197
- if use_stream:
198
- # Streaming: write chunks as they arrive
199
- with open(out_path, "wb") as f:
200
- for chunk in eleven_client.text_to_speech.convert(
201
- voice_id=voice_id,
202
- optimize_streaming_latency=optimize_streaming_latency,
203
- model_id=model,
204
- output_format="mp3_44100_128",
205
- text=narration,
206
- voice_settings=VoiceSettings(
207
- stability=0.5,
208
- similarity_boost=0.8,
209
- style=0.3,
210
- use_speaker_boost=True,
211
- ),
212
- stream=True,
213
- ):
214
- if isinstance(chunk, bytes):
215
- f.write(chunk)
216
- else:
217
- audio = eleven_client.text_to_speech.convert(
218
- voice_id=voice_id,
219
- model_id=model,
220
- output_format="mp3_44100_128",
221
- text=narration,
222
- voice_settings=VoiceSettings(
223
- stability=0.5,
224
- similarity_boost=0.8,
225
- style=0.3,
226
- use_speaker_boost=True,
227
- ),
228
- )
229
- with open(out_path, "wb") as f:
230
- f.write(audio)
231
- # Roughly compute duration from word count; could probe with ffprobe for exact.
232
- return ensure_duration(narration)
233
 
 
234
  def list_elevenlabs_voices() -> List[Dict[str, str]]:
235
- if not ELEVEN_AVAILABLE:
 
236
  return []
237
  try:
 
238
  voices = eleven_client.voices.get_all()
239
- out = []
 
240
  for v in voices.voices:
241
- out.append({"id": v.voice_id, "name": v.name})
242
- return out
243
  except Exception as e:
244
- log.warning(f"Failed to list voices: {e}")
245
  return []
246
 
247
- def build_prompt_image_data_uri(image_path: str) -> str:
248
- import base64
249
- with open(image_path, "rb") as f:
250
- b64 = base64.b64encode(f.read()).decode("utf-8")
251
- return f"data:image/png;base64,{b64}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
 
253
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
 
254
  try:
255
  task = runway_client.image_to_video.create(
256
  model="gen4_turbo",
@@ -282,9 +318,8 @@ def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, rat
282
  raise gr.Error("Runway returned no outputs.")
283
  video_url = outputs[0]
284
 
285
- import httpx
286
  clip_path = f"runway_clip_{uid()}.mp4"
287
- with httpx.stream("GET", video_url, timeout=120) as resp:
288
  resp.raise_for_status()
289
  with open(clip_path, "wb") as f:
290
  for chunk in resp.iter_bytes():
@@ -295,40 +330,41 @@ def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str) -> No
295
  list_file = f"concat_{uid()}.txt"
296
  with open(list_file, "w") as lf:
297
  for p in video_paths:
298
- lf.write(f"file '{p}'
299
- ")
300
  temp_concat = f"combined_{uid()}.mp4"
301
  subprocess.run([
302
- "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", temp_concat, "-y"
 
303
  ], check=True)
304
  subprocess.run([
305
- "ffmpeg", "-i", temp_concat, "-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
 
306
  ], check=True)
307
  for p in (list_file, temp_concat):
308
- try:
309
- os.remove(p)
310
- except OSError:
311
- pass
312
 
313
- def enhance_scene_prompt(base: str) -> str:
314
- return f"{base}. {GLOBAL_STYLE}"
315
 
316
- # ---------------- Main Generation Function ----------------
317
  def generate_video_from_topic(
318
  topic: str,
319
- keyframe_image: Optional[str],
320
  scene_count: int,
321
  clip_duration: int,
322
  ratio: str,
323
- use_eleven: bool,
324
- eleven_voice: str,
325
- eleven_model: str,
326
- streaming: bool,
327
- optimize_latency: int,
328
  progress=gr.Progress(track_tqdm=True)
329
  ) -> str:
330
  job = uid()
331
- log.info(f"[AI-STUDIO] Starting job {job} :: topic='{topic}'")
332
  temp_files: List[str] = []
333
  try:
334
  if not topic or not topic.strip():
@@ -345,65 +381,79 @@ def generate_video_from_topic(
345
  narration = script["narration"]
346
  scenes = script["scenes"]
347
 
348
- progress(0.30, desc="🎙️ Generating narration audio...")
349
- audio_path = f"audio_{job}.mp3"
350
  temp_files.append(audio_path)
351
- if use_eleven and ELEVEN_AVAILABLE:
352
- elevenlabs_tts(
353
- narration=narration,
354
- voice_id=eleven_voice,
355
- out_path=audio_path,
356
- model=eleven_model,
357
- optimize_streaming_latency=optimize_latency,
358
- use_stream=streaming,
359
  )
360
- else:
361
- mock_audio(narration, audio_path)
362
 
363
- progress(0.40, desc="🖼️ Preparing keyframe image...")
364
- if keyframe_image:
365
- prompt_image_path = keyframe
366
  else:
367
  prompt_image_path = generate_placeholder_image(topic)
368
  temp_files.append(prompt_image_path)
369
- prompt_image_data_uri = build_prompt_image_data_uri(prompt_image_path)
370
 
 
371
  video_clips: List[str] = []
 
372
  for idx, base_prompt in enumerate(scenes, start=1):
373
- progress(0.40 + (0.45 * idx / scene_count), desc=f"🎬 Generating scene {idx}/{scene_count}...")
374
- full_prompt = enhance_scene_prompt(base_prompt)
 
375
  try:
376
  clip_path = runway_generate_clip(
377
- prompt_image=prompt_image_data_uri,
378
  text_prompt=full_prompt,
379
  duration=clip_duration,
380
  ratio=ratio
381
  )
382
  except Exception as e:
383
- log.error(f"Scene {idx} failed: {e}; retrying once with refined prompt")
384
- retry_prompt = full_prompt + " -- refined detail, consistent style"
385
  clip_path = runway_generate_clip(
386
- prompt_image=prompt_image_data_uri,
387
  text_prompt=retry_prompt,
388
  duration=clip_duration,
389
  ratio=ratio
390
  )
391
- video_clips.append(clip_path)
392
- temp_files.append(clip_path)
393
 
394
  progress(0.92, desc="🧵 Stitching scenes...")
395
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
396
  concat_and_mux(video_clips, audio_path, final_out)
397
 
398
  progress(1.0, desc="✅ Done!")
399
- log.info(f"[AI-STUDIO] Job {job} completed -> {final_out}")
400
  return final_out
401
 
402
  except Exception as e:
403
- log.error(f"[AI-STUDIO] JOB {job} FAILED: {e}", exc_info=True)
404
  raise gr.Error(f"An error occurred: {e}")
405
  finally:
406
- # Clean up intermediate (keep keyframe if user uploaded it)
407
  for p in temp_files:
408
  try:
409
  if os.path.exists(p):
@@ -411,75 +461,71 @@ def generate_video_from_topic(
411
  except OSError:
412
  pass
413
 
414
- # ---------------- Voice Helper for UI ----------------
415
- def get_voice_choices() -> List[str]:
416
  voices = list_elevenlabs_voices()
417
- if not voices:
418
- return ["eleven_monolingual_v1"] # fallback placeholder id name pattern
419
- return [f"{v['name']}|{v['id']}" for v in voices]
420
-
421
- VOICE_CHOICES = get_voice_choices()
422
- DEFAULT_VOICE = VOICE_CHOICES[0] if VOICE_CHOICES else "Rachel|21m00Tcm4TlvDq8ikWAM" # Example default voice id pattern
423
 
424
  # ---------------- Gradio UI ----------------
425
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
426
  gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
427
  gr.Markdown(
428
- "Generate a multi-scene AI video: research β†’ script β†’ voiceover (mock or ElevenLabs) β†’ Gen-4 Turbo clips β†’ stitch."
 
429
  )
430
 
431
  with gr.Row():
432
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
433
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
434
 
435
- with gr.Accordion("Narration Settings (ElevenLabs)", open=False):
436
- use_eleven = gr.Checkbox(value=ELEVEN_AVAILABLE, label="Use ElevenLabs (falls back to mock if unchecked or unavailable)")
437
- voice_select = gr.Dropdown(choices=VOICE_CHOICES, value=DEFAULT_VOICE, label="Voice (Name|ID)")
438
- eleven_model = gr.Textbox(value="eleven_turbo_v2_5", label="ElevenLabs Model ID")
439
- streaming = gr.Checkbox(value=True, label="Stream TTS (lower latency)")
440
- optimize_latency = gr.Slider(0, 4, value=0, step=1, label="Optimize Streaming Latency (0=off, higher=more aggressive)")
441
-
442
  with gr.Row():
443
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
444
  duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
445
- ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"], value="1280:720", label="Aspect Ratio")
446
 
447
  generate_btn = gr.Button("🚀 Generate Video", variant="primary")
448
  output_video = gr.Video(label="Final Video")
449
 
450
- def _parse_voice(v: str) -> str:
451
- if "|" in v:
452
- return v.split("|", 1)[1]
453
- return v
454
-
455
- def wrapper(topic, keyframe, scene_count, duration, ratio, use_eleven, voice_combo, eleven_model, streaming, optimize_latency):
456
- voice_id = _parse_voice(voice_combo)
457
- return generate_video_from_topic(
458
- topic=topic,
459
- keyframe_image=keyframe,
460
- scene_count=scene_count,
461
- clip_duration=int(duration),
462
- ratio=ratio,
463
- use_eleven=use_eleven,
464
- eleven_voice=voice_id,
465
- eleven_model=eleven_model.strip() or "eleven_turbo_v2_5",
466
- streaming=streaming,
467
- optimize_latency=int(optimize_latency),
468
- )
469
 
470
  generate_btn.click(
471
- fn=wrapper,
472
- inputs=[topic, keyframe, scene_count, duration, ratio, use_eleven, voice_select, eleven_model, streaming, optimize_latency],
473
  outputs=output_video
474
  )
475
 
476
- gr.Markdown("""---
477
- ### Tips
478
- - Upload a keyframe to increase subject continuity.
479
- - Refine prompts by editing the generated scene prompts logic (extend code for manual review step).
480
- - ElevenLabs: if you get 401 errors, verify the API key and voice ID. For new voices, refresh the Space (reload) to repopulate the list.
481
- - Use 5s scenes for faster iteration; switch to 10s for final renders.
482
- """)
483
 
484
  if __name__ == "__main__":
485
  demo.launch()
 
12
  from PIL import Image, ImageDraw, ImageFont
13
 
14
  # External SDKs
15
+ import google.generativeai as genai # Gemini
16
+ from tavily import TavilyClient # Research enrichment
17
+ from runwayml import RunwayML, TaskFailedError # Runway SDK
18
+ from elevenlabs import ElevenLabs, APIError # ElevenLabs TTS (pip install elevenlabs)
19
+ import httpx
20
+ import base64
21
+
22
+ # ---------------- Logging ----------------
23
  logging.basicConfig(
24
  level=logging.INFO,
25
  format="[%(levelname)s %(asctime)s] %(message)s",
 
27
  )
28
  log = logging.getLogger("ai_video_studio")
29
 
30
+ # ---------------- Configuration / Keys ----------------
31
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
32
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
33
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
34
+ ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
35
 
36
+ missing = [k for k, v in {
37
  "GEMINI_API_KEY": GEMINI_API_KEY,
38
  "TAVILY_API_KEY": TAVILY_API_KEY,
39
+ "RUNWAY_API_KEY": RUNWAY_KEY
40
+ }.items() if not v]
 
41
  if missing:
42
  raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
43
 
44
  genai.configure(api_key=GEMINI_API_KEY)
45
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
46
  runway_client = RunwayML(api_key=RUNWAY_KEY)
47
+ eleven_client: Optional[ElevenLabs] = None
48
+ if ELEVEN_KEY:
49
+ eleven_client = ElevenLabs(api_key=ELEVEN_KEY)
50
 
51
  # ---------------- Constants ----------------
52
  DEFAULT_SCENES = 4
53
  MAX_SCENES = 8
54
+ ALLOWED_DURATIONS = {5, 10} # Runway Gen-4 supported lengths (seconds)
55
+ WORDS_PER_SEC = 2.5 # Heuristic for mock track
56
  PLACEHOLDER_BG = (18, 18, 22)
57
  PLACEHOLDER_FG = (239, 239, 245)
58
  FONT_CANDIDATES = [
59
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
60
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
61
  ]
62
 
63
+ # ---------------- Utility ----------------
64
  def uid() -> str:
65
  return f"{int(time.time())}_{random.randint(1000, 9999)}"
66
 
67
  def sanitize_filename(name: str) -> str:
68
+ safe = "".join(c for c in name if c.isalnum() or c in ("-", "_"))[:60]
69
  return safe or "video"
70
 
71
  def generate_placeholder_image(topic: str, width: int = 768, height: int = 432) -> str:
72
+ """Create a simple PNG keyframe if user didn't upload one."""
73
  img = Image.new("RGB", (width, height), PLACEHOLDER_BG)
74
  draw = ImageDraw.Draw(img)
75
  font = None
76
  for path in FONT_CANDIDATES:
77
  if Path(path).exists():
78
  try:
79
+ font = ImageFont.truetype(path, 42)
80
  break
81
  except Exception:
82
  pass
83
  if font is None:
84
  font = ImageFont.load_default()
85
 
86
+ max_chars = 24
87
+ wrapped: List[str] = []
88
+ line: List[str] = []
89
+ for w in topic.split():
90
+ test = " ".join(line + [w])
 
91
  if len(test) > max_chars:
92
+ wrapped.append(" ".join(line))
93
+ line = [w]
94
  else:
95
+ line.append(w)
96
+ if line:
97
+ wrapped.append(" ".join(line))
98
 
99
  total_h = 0
100
+ line_metrics = []
101
+ for ln in wrapped:
102
  bbox = draw.textbbox((0, 0), ln, font=font)
103
+ h = bbox[3] - bbox[1]
104
+ line_metrics.append((ln, h))
105
+ total_h += h + 10
106
  y = (height - total_h) // 2
107
+ for ln, h in line_metrics:
108
  bbox = draw.textbbox((0, 0), ln, font=font)
109
  w = bbox[2] - bbox[0]
110
  x = (width - w) // 2
111
  draw.text((x, y), ln, fill=PLACEHOLDER_FG, font=font)
112
+ y += h + 10
113
 
114
  out_path = f"placeholder_{uid()}.png"
115
  img.save(out_path)
116
  return out_path
117
 
118
  def research_topic(topic: str) -> str:
119
+ """Fetch supplemental facts; return safe fallback if API fails."""
120
  try:
121
  results = tavily_client.search(
122
  query=f"Key facts and interesting points about {topic}",
123
  search_depth="basic"
124
  )
125
  if results and "results" in results:
126
+ return "\n".join(
127
+ str(r.get("content", "")).strip()
128
+ for r in results["results"]
129
+ if r.get("content")
130
  )
131
  except Exception as e:
132
  log.warning(f"Tavily failed: {e}")
133
  return "No supplemental research facts available."
134
 
135
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str, Any]:
136
+ """Obtain narration + scene prompts as structured JSON from Gemini."""
137
  prompt = f"""
138
  You are a creative director for short-form educational / promotional videos.
139
 
142
  Supplemental Facts:
143
  {facts}
144
 
145
+ Return STRICT JSON:
146
+ {{
147
+ "narration_script": "<single cohesive narration>",
148
+ "scene_prompts": ["<scene 1>", ... (exactly {scene_count} total) ]
149
+ }}
150
+
151
+ Scene prompt requirements:
152
+ - <= 40 words
153
+ - Consistent main subject
154
+ - Include camera/movement term (e.g. "slow dolly in", "aerial sweep")
155
+ - Mention lighting/mood
156
+ NO markdown, NO extra commentary.
157
  """
158
  model = genai.GenerativeModel("gemini-1.5-flash")
159
  response = model.generate_content(prompt)
160
  raw = (response.text or "").strip()
161
+
162
  if raw.startswith("```"):
163
+ # strip code fences if present
164
+ raw = raw.strip("`")
165
+ if raw.lower().startswith("json"):
166
+ raw = raw[4:].strip()
167
+
168
  data = None
169
  try:
170
  data = json.loads(raw)
171
  except json.JSONDecodeError:
172
+ start = raw.find("{")
173
+ end = raw.rfind("}")
174
  if start != -1 and end != -1:
175
  try:
176
  data = json.loads(raw[start:end + 1])
177
  except Exception:
178
  pass
179
  if not isinstance(data, dict):
180
+ raise gr.Error("Gemini did not return valid JSON.")
181
+
182
  narration = data.get("narration_script")
183
  scenes = data.get("scene_prompts")
184
+
185
  if isinstance(narration, list):
186
  narration = " ".join(map(str, narration))
187
  if not isinstance(narration, str) or not narration.strip():
188
  raise gr.Error("Invalid narration_script returned.")
189
  narration = narration.strip()
190
+
191
  if not isinstance(scenes, list):
192
+ raise gr.Error("scene_prompts missing or not a list.")
193
  scenes = [str(s).strip() for s in scenes if str(s).strip()]
194
  if len(scenes) != scene_count:
195
+ # normalize length
196
  while len(scenes) < scene_count:
197
+ scenes.append(f"Dynamic cinematic shot about {topic}")
198
  scenes = scenes[:scene_count]
199
 
200
+ return {"narration": narration, "scenes": scenes}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
+ # ---------------- ElevenLabs Integration ----------------
203
  def list_elevenlabs_voices() -> List[Dict[str, str]]:
204
+ """Fetch voices (name + id) if ElevenLabs key available."""
205
+ if not eleven_client:
206
  return []
207
  try:
208
+ # The SDK's voices list method (internally hits the list voices endpoint)
209
  voices = eleven_client.voices.get_all()
210
+ # Normalize to simple dict
211
+ simplified = []
212
  for v in voices.voices:
213
+ simplified.append({"id": v.voice_id, "name": v.name})
214
+ return simplified
215
  except Exception as e:
216
+ log.warning(f"Failed to list ElevenLabs voices: {e}")
217
  return []
218
 
219
+ def synthesize_narration_elevenlabs(
220
+ text: str,
221
+ voice_id: str,
222
+ model_id: str,
223
+ stability: float,
224
+ similarity: float,
225
+ style: float,
226
+ speaker_boost: bool,
227
+ streaming: bool,
228
+ out_path: str
229
+ ) -> bool:
230
+ """Return True on success; False triggers fallback."""
231
+ if not eleven_client:
232
+ return False
233
+ try:
234
+ # Bound parameters
235
+ stability = max(0.0, min(1.0, stability))
236
+ similarity = max(0.0, min(1.0, similarity))
237
+ style = max(0.0, min(1.0, style))
238
+
239
+ if streaming:
240
+ # Streaming synthesis (chunked)
241
+ with open(out_path, "wb") as f:
242
+ for chunk in eleven_client.text_to_speech.convert_as_stream(
243
+ voice_id=voice_id,
244
+ model_id=model_id,
245
+ text=text,
246
+ optimize_streaming_latency=3,
247
+ voice_settings={
248
+ "stability": stability,
249
+ "similarity_boost": similarity,
250
+ "style": style,
251
+ "use_speaker_boost": speaker_boost
252
+ }
253
+ ):
254
+ f.write(chunk)
255
+ else:
256
+ # Standard synthesis (single request)
257
+ audio = eleven_client.text_to_speech.convert(
258
+ voice_id=voice_id,
259
+ model_id=model_id,
260
+ text=text,
261
+ voice_settings={
262
+ "stability": stability,
263
+ "similarity_boost": similarity,
264
+ "style": style,
265
+ "use_speaker_boost": speaker_boost
266
+ }
267
+ )
268
+ with open(out_path, "wb") as f:
269
+ f.write(audio)
270
+ return True
271
+ except APIError as e:
272
+ log.error(f"ElevenLabs API error: {e}")
273
+ except Exception as e:
274
+ log.error(f"ElevenLabs synthesis failed: {e}")
275
+ return False
276
+
277
+ def generate_mock_voiceover(narration: str, out_path: str) -> float:
278
+ """Silent track matching approximate narration length (fallback)."""
279
+ duration = max(2.0, min(300.0, len(narration.split()) / WORDS_PER_SEC))
280
+ subprocess.run([
281
+ "ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
282
+ "-t", f"{duration:.2f}", "-q:a", "9", "-acodec", "libmp3lame",
283
+ out_path, "-y"
284
+ ], check=True)
285
+ return duration
286
 
287
+ # ---------------- Runway Integration ----------------
288
  def runway_generate_clip(prompt_image: str, text_prompt: str, duration: int, ratio: str) -> str:
289
+ """Create image_to_video task and download resulting MP4."""
290
  try:
291
  task = runway_client.image_to_video.create(
292
  model="gen4_turbo",
 
318
  raise gr.Error("Runway returned no outputs.")
319
  video_url = outputs[0]
320
 
 
321
  clip_path = f"runway_clip_{uid()}.mp4"
322
+ with httpx.stream("GET", video_url, timeout=180) as resp:
323
  resp.raise_for_status()
324
  with open(clip_path, "wb") as f:
325
  for chunk in resp.iter_bytes():
 
330
  list_file = f"concat_{uid()}.txt"
331
  with open(list_file, "w") as lf:
332
  for p in video_paths:
333
+ lf.write(f"file '{p}'\n")
 
334
  temp_concat = f"combined_{uid()}.mp4"
335
  subprocess.run([
336
+ "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file,
337
+ "-c", "copy", temp_concat, "-y"
338
  ], check=True)
339
  subprocess.run([
340
+ "ffmpeg", "-i", temp_concat, "-i", audio_path,
341
+ "-c:v", "copy", "-c:a", "aac", "-shortest", out_path, "-y"
342
  ], check=True)
343
  for p in (list_file, temp_concat):
344
+ try: os.remove(p)
345
+ except OSError: pass
346
 
347
+ def enhance_scene_prompt(base: str, global_style: str) -> str:
348
+ return f"{base}. {global_style}"
349
 
350
+ # ---------------- Core Generation ----------------
351
  def generate_video_from_topic(
352
  topic: str,
353
+ uploaded_keyframe: Optional[str],
354
  scene_count: int,
355
  clip_duration: int,
356
  ratio: str,
357
+ voice_id: str,
358
+ model_id: str,
359
+ stability: float,
360
+ similarity: float,
361
+ style: float,
362
+ speaker_boost: bool,
363
+ use_streaming_tts: bool,
364
  progress=gr.Progress(track_tqdm=True)
365
  ) -> str:
366
  job = uid()
367
+ log.info(f"[AI-STUDIO] Start job {job} topic='{topic}'")
368
  temp_files: List[str] = []
369
  try:
370
  if not topic or not topic.strip():
 
381
  narration = script["narration"]
382
  scenes = script["scenes"]
383
 
384
+ progress(0.30, desc="🎙️ Generating narration...")
385
+ audio_path = f"narration_{job}.mp3"
386
  temp_files.append(audio_path)
387
+
388
+ tts_success = False
389
+ if ELEVEN_KEY and voice_id and model_id:
390
+ tts_success = synthesize_narration_elevenlabs(
391
+ text=narration,
392
+ voice_id=voice_id,
393
+ model_id=model_id,
394
+ stability=stability,
395
+ similarity=similarity,
396
+ style=style,
397
+ speaker_boost=speaker_boost,
398
+ streaming=use_streaming_tts,
399
+ out_path=audio_path
400
  )
401
 
402
+ if not tts_success:
403
+ log.warning("Using mock silent track (ElevenLabs unavailable or failed).")
404
+ generate_mock_voiceover(narration, audio_path)
405
+
406
+ progress(0.40, desc="🖼️ Preparing keyframe...")
407
+ if uploaded_keyframe:
408
+ prompt_image_path = uploaded_keyframe
409
  else:
410
  prompt_image_path = generate_placeholder_image(topic)
411
  temp_files.append(prompt_image_path)
412
+ with open(prompt_image_path, "rb") as f:
413
+ b64 = base64.b64encode(f.read()).decode("utf-8")
414
+ prompt_image = f"data:image/png;base64,{b64}"
415
 
416
+ global_style = "Cinematic, natural volumetric light, subtle camera motion, cohesive style, high detail"
417
  video_clips: List[str] = []
418
+
419
  for idx, base_prompt in enumerate(scenes, start=1):
420
+ progress(0.40 + 0.45 * idx / scene_count,
421
+ desc=f"🎬 Generating scene {idx}/{scene_count}...")
422
+ full_prompt = enhance_scene_prompt(base_prompt, global_style)
423
  try:
424
  clip_path = runway_generate_clip(
425
+ prompt_image=prompt_image,
426
  text_prompt=full_prompt,
427
  duration=clip_duration,
428
  ratio=ratio
429
  )
430
+ video_clips.append(clip_path)
431
+ temp_files.append(clip_path)
432
  except Exception as e:
433
+ log.error(f"Scene {idx} failed: {e}")
434
+ retry_prompt = full_prompt + " -- consistent subject, refined detail"
435
  clip_path = runway_generate_clip(
436
+ prompt_image=prompt_image,
437
  text_prompt=retry_prompt,
438
  duration=clip_duration,
439
  ratio=ratio
440
  )
441
+ video_clips.append(clip_path)
442
+ temp_files.append(clip_path)
443
 
444
  progress(0.92, desc="🧵 Stitching scenes...")
445
  final_out = f"{sanitize_filename(topic)}_{job}.mp4"
446
  concat_and_mux(video_clips, audio_path, final_out)
447
 
448
  progress(1.0, desc="✅ Done!")
449
+ log.info(f"[AI-STUDIO] Job {job} complete -> {final_out}")
450
  return final_out
451
 
452
  except Exception as e:
453
+ log.error(f"[AI-STUDIO] Job {job} FAILED: {e}", exc_info=True)
454
  raise gr.Error(f"An error occurred: {e}")
455
  finally:
456
+ # Clean temp artifacts (not final video)
457
  for p in temp_files:
458
  try:
459
  if os.path.exists(p):
 
461
  except OSError:
462
  pass
463
 
464
+ # ---------------- Helper for Voice Dropdown ----------------
465
+ def refresh_voices() -> List[str]:
466
  voices = list_elevenlabs_voices()
467
+ return [f"{v['name']}|{v['id']}" for v in voices] if voices else []
 
 
 
 
 
468
 
469
  # ---------------- Gradio UI ----------------
470
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
471
  gr.Markdown("# 🎬 AI Video Studio (Runway Gen-4 Turbo + Gemini + ElevenLabs)")
472
  gr.Markdown(
473
+ "Provide a topic (and optional keyframe). We’ll research, script, generate multi-scene video, "
474
+ "synthesize narration, and assemble the final clip."
475
  )
476
 
477
  with gr.Row():
478
  topic = gr.Textbox(label="Video Topic", placeholder="e.g., The history of coffee", scale=3)
479
  keyframe = gr.Image(type="filepath", label="Optional Keyframe (Image)", scale=2)
480
 
481
  with gr.Row():
482
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Number of Scenes")
483
  duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds per Scene")
484
+ ratio = gr.Dropdown(choices=["1280:720", "1920:1080", "1080:1920", "1024:1024"],
485
+ value="1280:720", label="Aspect Ratio")
486
+
487
+ gr.Markdown("### Narration (ElevenLabs)")
488
+ with gr.Row():
489
+ refresh_btn = gr.Button("🔄 Refresh Voices", variant="secondary")
490
+ voices_dd = gr.Dropdown(choices=[], label="Voice (Name|ID)", value=None)
491
+ model_dd = gr.Dropdown(
492
+ choices=[
493
+ "eleven_multilingual_v2", "eleven_turbo_v2_5",
494
+ "eleven_flash_v2_5", "eleven_monolingual_v1"
495
+ ],
496
+ value="eleven_turbo_v2_5",
497
+ label="ElevenLabs Model"
498
+ )
499
+ streaming_chk = gr.Checkbox(label="Streaming TTS", value=False)
500
+
501
+ with gr.Row():
502
+ stability = gr.Slider(0, 1, value=0.55, step=0.01, label="Stability")
503
+ similarity = gr.Slider(0, 1, value=0.80, step=0.01, label="Similarity Boost")
504
+ style = gr.Slider(0, 1, value=0.20, step=0.01, label="Style")
505
+ speaker_boost = gr.Checkbox(label="Speaker Boost", value=True)
506
 
507
  generate_btn = gr.Button("🚀 Generate Video", variant="primary")
508
  output_video = gr.Video(label="Final Video")
509
 
510
+ def _do_refresh():
511
+ return gr.update(choices=refresh_voices())
512
+
513
+ refresh_btn.click(fn=_do_refresh, outputs=voices_dd)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  generate_btn.click(
516
+ fn=generate_video_from_topic,
517
+ inputs=[
518
+ topic, keyframe, scene_count, duration, ratio,
519
+ voices_dd, model_dd, stability, similarity, style,
520
+ speaker_boost, streaming_chk
521
+ ],
522
  outputs=output_video
523
  )
524
 
525
+ gr.Markdown("### Tips\n"
526
+ "- Provide a strong keyframe for better temporal coherence.\n"
527
+ "- Refine scene prompts by adjusting topic wording if motion feels generic.\n"
528
+ "- Tweak Stability and Similarity to balance expressiveness vs consistency.")
 
 
 
529
 
530
  if __name__ == "__main__":
531
  demo.launch()
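
Note on narration duration: both sides of this diff size the fallback silent track from word count (WORDS_PER_SEC = 2.5), and the removed ElevenLabs path carried the comment "could probe with ffprobe for exact." A minimal sketch of that probe, assuming ffprobe is installed alongside ffmpeg; the helper name is hypothetical and not part of this commit:

    import json
    import subprocess

    def probe_audio_duration(path: str) -> float:
        # Hypothetical follow-up: read the real duration from the file
        # instead of estimating it with the WORDS_PER_SEC heuristic.
        out = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "json", path],
            capture_output=True, text=True, check=True,
        ).stdout
        return float(json.loads(out)["format"]["duration"])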