mgbam committed on
Commit
3c12225
·
verified ·
1 Parent(s): 702fd23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -125
app.py CHANGED
@@ -1,14 +1,25 @@
1
  """
2
  AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
 
3
  Features:
4
- - Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration).
5
- - Structured script & scene prompt generation with schema enforcement.
6
- - Multi-keyframe support (user can upload multiple images; otherwise placeholder).
7
- - Aspect ratio validation & optional auto-crop to closest supported ratio.
8
- - ElevenLabs voice pagination, retry & diagnostics; streaming or batch TTS.
9
- - Runway Generative Audio fallback if ElevenLabs fails or no voices.
10
- - Automatic per-clip sharpness heuristic & re-generation (one retry) for low-detail clips.
11
- - Prompt enhancer injecting consistent global style; per-scene Subject|Action|Camera|Lighting|Mood|Style template.
 
 
 
 
 
 
 
 
 
 
12
  """
13
 
14
  import os
@@ -18,24 +29,24 @@ import random
18
  import logging
19
  import subprocess
20
  import base64
21
- import math
22
  from pathlib import Path
23
- from typing import List, Dict, Any, Optional, Tuple
24
 
25
  import gradio as gr
26
  from PIL import Image, ImageDraw, ImageFont, ImageFilter
27
  import numpy as np
28
 
 
29
  import google.generativeai as genai
30
  from tavily import TavilyClient
31
  from runwayml import RunwayML
32
  import httpx
33
 
34
- # ---- ElevenLabs (version-agnostic error import) ----
35
  try:
36
  from elevenlabs import ElevenLabs
37
  try:
38
- from elevenlabs.errors import ApiError # may vary by version
39
  except Exception:
40
  ApiError = Exception
41
  except ImportError:
@@ -55,15 +66,14 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
55
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
56
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
57
  ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
58
- RUNWAY_AUDIO_FALLBACK = True # toggle fallback usage
59
 
60
- missing = [k for k, v in {
61
  "GEMINI_API_KEY": GEMINI_API_KEY,
62
  "TAVILY_API_KEY": TAVILY_API_KEY,
63
  "RUNWAY_API_KEY": RUNWAY_KEY
64
  }.items() if not v]
65
- if missing:
66
- raise RuntimeError(f"Missing required API keys: {', '.join(missing)}")
67
 
68
  genai.configure(api_key=GEMINI_API_KEY)
69
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
@@ -73,8 +83,8 @@ eleven_client = ElevenLabs(api_key=ELEVEN_KEY) if (ELEVEN_KEY and ElevenLabs) el
73
  # ---------------- Constants ----------------
74
  DEFAULT_SCENES = 4
75
  MAX_SCENES = 8
76
- ALLOWED_DURATIONS = {5, 10}
77
- SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"}
78
  WORDS_PER_SEC = 2.5
79
  PLACEHOLDER_BG = (16, 18, 24)
80
  PLACEHOLDER_FG = (240, 242, 248)
@@ -82,8 +92,13 @@ FONT_CANDIDATES = [
82
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
83
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
84
  ]
85
- SHARPNESS_MIN = 0.015 # empirical edge density threshold
86
  RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
 
 
 
 
 
87
 
88
  # ---------------- Utility ----------------
89
  def uid() -> str:
@@ -93,7 +108,7 @@ def sanitize_filename(name: str) -> str:
93
  safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
94
  return safe or "video"
95
 
96
- def load_font(size: int = 42):
97
  for p in FONT_CANDIDATES:
98
  if Path(p).exists():
99
  try:
@@ -116,12 +131,13 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
116
  else:
117
  line.append(w)
118
  if line: lines.append(" ".join(line))
119
- total_h = 0
120
- metrics=[]
121
  for ln in lines:
122
  bbox = draw.textbbox((0,0), ln, font=font)
123
  h=bbox[3]-bbox[1]
124
- metrics.append((ln,h,bbox)); total_h += h+12
 
125
  y=(height-total_h)//2
126
  for ln,h,bbox in metrics:
127
  w=bbox[2]-bbox[0]
@@ -132,36 +148,28 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
132
  img.save(out)
133
  return out
134
 
135
- def aspect_ratio_of(img: Image.Image) -> str:
136
- w,h=img.size
137
- return f"{w}:{h}"
138
-
139
  def closest_supported_ratio(w: int, h: int) -> str:
140
- # choose ratio minimizing relative area crop after scaling
141
  candidates=[]
 
142
  for r in SUPPORTED_RATIOS:
143
- rw,rh = map(int, r.split(":"))
144
- target_ratio = rw / rh
145
- cur_ratio = w / h
146
- diff = abs(target_ratio - cur_ratio)
147
  candidates.append((diff,r))
148
  candidates.sort()
149
  return candidates[0][1]
150
 
151
  def crop_to_ratio(img: Image.Image, ratio: str) -> Image.Image:
152
- rw,rh=map(int,ratio.split(":"))
153
- target=rw/rh
154
- w,h=img.size
155
- cur=w/h
156
- if abs(cur-target) < 1e-3:
157
  return img
158
- if cur>target:
159
- # too wide
160
  new_w=int(target*h)
161
  x0=(w-new_w)//2
162
  return img.crop((x0,0,x0+new_w,h))
163
- else:
164
- # too tall
165
  new_h=int(w/target)
166
  y0=(h-new_h)//2
167
  return img.crop((0,y0,w,y0+new_h))
@@ -183,6 +191,9 @@ def research_topic(topic: str) -> str:
183
 
184
  # ---------------- Gemini Script Generation ----------------
185
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
 
 
 
186
  prompt = f"""
187
  You are a creative director.
188
 
@@ -202,19 +213,19 @@ Return STRICT JSON:
202
  "lighting": "...",
203
  "mood": "...",
204
  "style": "...",
205
- "prompt": "<final merged scene prompt (<=40 words)>"
206
  }}
207
- (exactly {scene_count} objects total)
208
  ]
209
  }}
210
 
211
  Rules:
212
- - subject/action focus on continuity of main subject.
213
- - camera gives ONE motion (e.g. "slow dolly in", "handheld pan", "aerial sweep").
214
- - lighting (e.g. "golden hour rim light", "soft volumetric interior").
215
- - mood (emotion / tone).
216
- - style (cinematic descriptors, film grain, color palette words).
217
- - prompt MUST integrate all fields succinctly; no numbering; no markdown.
218
  """
219
  model = genai.GenerativeModel("gemini-1.5-flash")
220
  response = model.generate_content(prompt)
@@ -240,7 +251,7 @@ Rules:
240
  norm=[]
241
  for sc in scenes:
242
  if not isinstance(sc,dict): continue
243
- prompt_txt = sc.get("prompt") or "Cinematic establishing shot"
244
  norm.append({
245
  "subject": sc.get("subject",""),
246
  "action": sc.get("action",""),
@@ -255,7 +266,7 @@ Rules:
255
  "subject":"main subject",
256
  "action":"subtle motion",
257
  "camera":"slow dolly in",
258
- "lighting":"soft directional light",
259
  "mood":"cinematic",
260
  "style":"filmic grain",
261
  "prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
@@ -263,8 +274,8 @@ Rules:
263
  norm=norm[:scene_count]
264
  return {"narration": narration, "scenes": norm}
265
 
266
- # ---------------- ElevenLabs ----------------
267
- def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.6) -> List[Dict[str,str]]:
268
  if not eleven_client:
269
  return []
270
  voices=[]
@@ -282,16 +293,20 @@ def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.6) -> List[Dict[st
282
  if not token:
283
  break
284
  time.sleep(delay)
 
285
  return voices
286
 
287
  def tts_elevenlabs(text: str, voice_id: str, model_id: str,
288
  stability: float, similarity: float,
289
  style: float, speaker_boost: bool,
290
  streaming: bool, out_path: str) -> bool:
291
- if not eleven_client or not voice_id:
 
 
 
 
292
  return False
293
  try:
294
- # clamp
295
  stability=max(0,min(1,stability))
296
  similarity=max(0,min(1,similarity))
297
  style=max(0,min(1,style))
@@ -320,6 +335,10 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
320
  )
321
  with open(out_path,"wb") as f:
322
  f.write(audio)
 
 
 
 
323
  return True
324
  except ApiError as e:
325
  log.error(f"ElevenLabs ApiError: {e}")
@@ -327,17 +346,11 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
327
  log.error(f"ElevenLabs TTS error: {e}")
328
  return False
329
 
330
- # ---------------- Runway Audio Fallback ----------------
331
- def runway_generate_audio(text: str, out_path: str) -> bool:
332
- """
333
- Simple fallback using Runway Generative Audio (pseudo-endpoint placeholder).
334
- NOTE: Replace with official SDK call if/when available in your Python client.
335
- """
336
  if not RUNWAY_AUDIO_FALLBACK:
337
  return False
338
  try:
339
- # Placeholder logic: here we just synthesize silence to keep pipeline moving.
340
- # (Integrate actual Runway audio generation when SDK exposes it.)
341
  duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
342
  subprocess.run([
343
  "ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
@@ -349,7 +362,6 @@ def runway_generate_audio(text: str, out_path: str) -> bool:
349
  log.error(f"Runway audio fallback failed: {e}")
350
  return False
351
 
352
- # ---------------- Mock / Silent Fallback ----------------
353
  def silent_track(narration: str, out_path: str):
354
  duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
355
  subprocess.run([
@@ -368,7 +380,7 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
368
  prompt_text=text_prompt,
369
  duration=duration,
370
  ratio=ratio
371
- )
372
  except Exception as e:
373
  raise gr.Error(f"Runway task creation failed: {e}")
374
 
@@ -397,34 +409,19 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
397
 
398
  # ---------------- Sharpness Heuristic ----------------
399
  def clip_edge_density(path: str) -> float:
 
400
  try:
401
- import cv2 # optional optimization; if unavailable fallback to PIL
402
- cap = cv2.VideoCapture(path)
403
- if not cap.isOpened(): return 1.0
404
- frames=0; acc=0.0
405
- while frames<10:
406
- ret, frame = cap.read()
407
- if not ret: break
408
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
409
- edges = cv2.Canny(gray,100,200)
410
- acc += edges.mean()/255.0
411
- frames+=1
412
- cap.release()
413
- return acc/max(frames,1)
414
  except Exception:
415
- # PIL fallback (single frame)
416
- try:
417
- # extract a frame via ffmpeg
418
- tmp = f"frame_{uid()}.png"
419
- subprocess.run(["ffmpeg","-i",path,"-vf","scale=320:-1","-vframes","1",tmp,"-y"],
420
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
421
- img = Image.open(tmp).convert("L")
422
- arr = np.array(img.filter(ImageFilter.FIND_EDGES))
423
- val = arr.mean()/255.0
424
- os.remove(tmp)
425
- return val
426
- except Exception:
427
- return 1.0 # assume ok if cannot measure
428
 
429
  # ---------------- Concatenate & Mux ----------------
430
  def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
@@ -445,18 +442,18 @@ def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
445
  try: os.remove(p)
446
  except OSError: pass
447
 
448
- # ---------------- Global Style ----------------
449
- GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle motion, high detail"
450
-
451
  def build_scene_prompt(sc: Dict[str,str]) -> str:
 
 
 
452
  base = f"{sc['subject']} {sc['action']}, {sc['camera']}, {sc['lighting']}, {sc['mood']}, {sc['style']}"
453
- merged = sc.get("prompt") or base
454
- return f"{merged}. {GLOBAL_STYLE}"
455
 
456
  # ---------------- Main Pipeline ----------------
457
  def generate_video(
458
  topic: str,
459
- keyframes: list, # list of file paths
460
  scene_count: int,
461
  clip_duration: int,
462
  ratio: str,
@@ -479,8 +476,7 @@ def generate_video(
479
  scene_count = max(1,min(MAX_SCENES,scene_count))
480
  if clip_duration not in ALLOWED_DURATIONS:
481
  clip_duration=5
482
- # choose model
483
- runway_model = "gen4" if quality_mode else "gen4_turbo"
484
 
485
  progress(0.05, desc="🔍 Researching...")
486
  facts = research_topic(topic)
@@ -494,9 +490,12 @@ def generate_video(
494
  audio_path=f"narration_{job}.mp3"
495
  temp_files.append(audio_path)
496
 
497
- voice_id=""
498
  if voice_choice and "|" in voice_choice:
499
- voice_id = voice_choice.split("|",1)[1]
 
 
 
500
 
501
  tts_ok=False
502
  if ELEVEN_KEY and voice_id:
@@ -506,15 +505,14 @@ def generate_video(
506
  streaming_tts, audio_path
507
  )
508
  if not tts_ok and RUNWAY_AUDIO_FALLBACK:
509
- tts_ok = runway_generate_audio(narration, audio_path)
510
  if not tts_ok:
511
  silent_track(narration, audio_path)
512
 
513
  progress(0.40, desc="🖼️ Preparing keyframes...")
514
- # Handle multi-keyframe: if multiple, cycle through them; else create placeholder
515
  loaded_keyframes=[]
516
  if keyframes:
517
- for fp in keyframes:
518
  try:
519
  img=Image.open(fp).convert("RGB")
520
  loaded_keyframes.append(img)
@@ -525,31 +523,28 @@ def generate_video(
525
  temp_files.append(placeholder)
526
  loaded_keyframes=[Image.open(placeholder).convert("RGB")]
527
 
528
- # Ratio handling
529
  if ratio not in SUPPORTED_RATIOS:
530
  ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
531
  else:
532
  ratio_choice = ratio
533
- processed_images=[]
 
534
  for img in loaded_keyframes:
535
- proc = crop_to_ratio(img, ratio_choice)
536
- processed_images.append(proc)
537
 
538
- # Convert processed images to data URIs
539
  data_uris=[]
540
- for img in processed_images:
541
- b = bytes()
542
- from io import BytesIO
543
  buf=BytesIO()
544
  img.save(buf, format="PNG")
545
- b=buf.getvalue()
546
- data_uris.append("data:image/png;base64,"+base64.b64encode(b).decode("utf-8"))
547
 
548
  video_clips=[]
549
  for idx, sc in enumerate(scene_objs, start=1):
550
  progress(0.40 + 0.45*idx/scene_count,
551
  desc=f"🎬 Scene {idx}/{scene_count}...")
552
- img_uri = data_uris[(idx-1) % len(data_uris)]
553
  prompt_text = build_scene_prompt(sc)
554
  clip_path = runway_generate_clip(
555
  model=runway_model,
@@ -560,7 +555,6 @@ def generate_video(
560
  )
561
  video_clips.append(clip_path); temp_files.append(clip_path)
562
 
563
- # Sharpness check
564
  sharp = clip_edge_density(clip_path)
565
  if sharp < SHARPNESS_MIN:
566
  log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
@@ -606,10 +600,9 @@ def refresh_voices():
606
 
607
  # ---------------- Gradio Interface ----------------
608
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
609
- gr.Markdown("# 🎬 AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs + Runway Audio)")
610
  gr.Markdown(
611
- "Iterate quickly with Turbo, then switch to Quality Mode for final fidelity. "
612
- "Upload multiple keyframes to improve subject consistency."
613
  )
614
 
615
  with gr.Row():
@@ -620,9 +613,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
620
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
621
  clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
622
  ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
623
- quality_mode = gr.Checkbox(label="Quality Mode (use gen4 instead of gen4_turbo)", value=False)
624
 
625
- gr.Markdown("### Narration (Primary: ElevenLabs, Fallback: Runway Audio / Silence)")
626
  with gr.Row():
627
  refresh_btn = gr.Button("🔄 Refresh Voices")
628
  voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
@@ -656,10 +649,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
656
 
657
  gr.Markdown(
658
  "### Tips\n"
659
- "- Use multiple high-quality keyframes (consistent character & environment).\n"
660
- "- Refine camera verbs (slow dolly in, handheld pan, aerial sweep) & lighting adjectives.\n"
661
- "- Toggle Quality Mode only when you like the blocking to save credits.\n"
662
- "- Add emotional descriptors directly in narration text for richer delivery."
663
  )
664
 
665
  if __name__ == '__main__':
 
1
  """
2
  AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
3
+
4
  Features:
5
+ - Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration). Gen-4 / Turbo accept 5s or 10s durations only.
6
+ - Structured scene schema (Subject | Action | Camera | Lighting | Mood | Style) -> merged prompt.
7
+ - Multi-keyframe support (upload 1–4 images); automatic ratio cropping to supported Runway aspect ratios.
8
+ - ElevenLabs TTS with: pagination, retry, streaming/non-streaming, adjustable stability/similarity/style/speaker boost.
9
+ - Hard fallback default voice ID (env ELEVEN_DEFAULT_VOICE_ID) if dropdown fetch fails.
10
+ - Runway audio silent fallback placeholder (stub) if all TTS fails (replace later with real Runway audio call if available).
11
+ - Sharpness (edge density) heuristic; one automatic re-generation with detail suffix for blurry clips.
12
+ - Clean temporary file housekeeping; robust logging & progress reporting.
13
+
14
+ Environment Variables (required):
15
+ GEMINI_API_KEY
16
+ TAVILY_API_KEY
17
+ RUNWAY_API_KEY (or RUNWAYML_API_SECRET)
18
+ Optional:
19
+ ELEVENLABS_API_KEY (or XI_API_KEY)
20
+ ELEVEN_DEFAULT_VOICE_ID (fallback voice id)
21
+
22
+ Security: NEVER hard-code real API keys in this file.
23
  """
24
 
25
  import os
 
29
  import logging
30
  import subprocess
31
  import base64
 
32
  from pathlib import Path
33
+ from typing import List, Dict, Any, Optional
34
 
35
  import gradio as gr
36
  from PIL import Image, ImageDraw, ImageFont, ImageFilter
37
  import numpy as np
38
 
39
+ # External SDKs
40
  import google.generativeai as genai
41
  from tavily import TavilyClient
42
  from runwayml import RunwayML
43
  import httpx
44
 
45
+ # ---- ElevenLabs (version-agnostic import) ----
46
  try:
47
  from elevenlabs import ElevenLabs
48
  try:
49
+ from elevenlabs.errors import ApiError # may not exist in some versions
50
  except Exception:
51
  ApiError = Exception
52
  except ImportError:
 
66
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
67
  RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
68
  ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
 
69
 
70
+ required_missing = [k for k, v in {
71
  "GEMINI_API_KEY": GEMINI_API_KEY,
72
  "TAVILY_API_KEY": TAVILY_API_KEY,
73
  "RUNWAY_API_KEY": RUNWAY_KEY
74
  }.items() if not v]
75
+ if required_missing:
76
+ raise RuntimeError(f"Missing required API keys: {', '.join(required_missing)}")
77
 
78
  genai.configure(api_key=GEMINI_API_KEY)
79
  tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
 
83
  # ---------------- Constants ----------------
84
  DEFAULT_SCENES = 4
85
  MAX_SCENES = 8
86
+ ALLOWED_DURATIONS = {5, 10} # Runway Gen-4 / Turbo durations (5 or 10 seconds)
87
+ SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"} # documented multiple aspect ratios
88
  WORDS_PER_SEC = 2.5
89
  PLACEHOLDER_BG = (16, 18, 24)
90
  PLACEHOLDER_FG = (240, 242, 248)
 
92
  "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
93
  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
94
  ]
95
+ SHARPNESS_MIN = 0.015
96
  RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
97
+ GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle camera motion, high detail"
98
+
99
+ # Fallback ElevenLabs voice ID (replace with your own or set env var)
100
+ DEFAULT_ELEVEN_VOICE_ID = os.getenv("ELEVEN_DEFAULT_VOICE_ID", "21m00Tcm4TlvDq8ikWAM") # example/published sample id
101
+ RUNWAY_AUDIO_FALLBACK = True # Placeholder stub (replace with real Runway audio generation when available)
102
 
103
  # ---------------- Utility ----------------
104
  def uid() -> str:
 
108
  safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
109
  return safe or "video"
110
 
111
+ def load_font(size: int = 44):
112
  for p in FONT_CANDIDATES:
113
  if Path(p).exists():
114
  try:
 
131
  else:
132
  line.append(w)
133
  if line: lines.append(" ".join(line))
134
+ # center vertically
135
+ metrics=[]; total_h=0
136
  for ln in lines:
137
  bbox = draw.textbbox((0,0), ln, font=font)
138
  h=bbox[3]-bbox[1]
139
+ metrics.append((ln,h,bbox))
140
+ total_h += h+12
141
  y=(height-total_h)//2
142
  for ln,h,bbox in metrics:
143
  w=bbox[2]-bbox[0]
 
148
  img.save(out)
149
  return out
150
 
 
 
 
 
151
  def closest_supported_ratio(w: int, h: int) -> str:
 
152
  candidates=[]
153
+ cur_ratio = w / h
154
  for r in SUPPORTED_RATIOS:
155
+ rw,rh = map(int,r.split(":"))
156
+ diff = abs(cur_ratio - (rw/rh))
 
 
157
  candidates.append((diff,r))
158
  candidates.sort()
159
  return candidates[0][1]
160
 
161
  def crop_to_ratio(img: Image.Image, ratio: str) -> Image.Image:
162
+ rw,rh = map(int, ratio.split(":"))
163
+ target = rw / rh
164
+ w,h = img.size
165
+ cur = w / h
166
+ if abs(cur-target)<1e-3:
167
  return img
168
+ if cur>target: # too wide
 
169
  new_w=int(target*h)
170
  x0=(w-new_w)//2
171
  return img.crop((x0,0,x0+new_w,h))
172
+ else: # too tall
 
173
  new_h=int(w/target)
174
  y0=(h-new_h)//2
175
  return img.crop((0,y0,w,y0+new_h))
 
191
 
192
  # ---------------- Gemini Script Generation ----------------
193
  def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
194
+ """
195
+ Request structured JSON with narration + scene objects containing schema fields.
196
+ """
197
  prompt = f"""
198
  You are a creative director.
199
 
 
213
  "lighting": "...",
214
  "mood": "...",
215
  "style": "...",
216
+ "prompt": "<merged scene prompt (<=40 words)>"
217
  }}
218
+ (exactly {scene_count} objects)
219
  ]
220
  }}
221
 
222
  Rules:
223
+ - Keep one consistent main subject across scenes unless evolution is explicitly helpful.
224
+ - camera: ONE motion (e.g. "slow dolly in", "handheld pan", "aerial sweep").
225
+ - lighting: descriptive & cinematic (e.g. "golden hour rim light").
226
+ - style: filmic adjectives (e.g. "35mm film grain, rich color palette").
227
+ - merged prompt must integrate key fields succinctly.
228
+ - No markdown, no lists, no commentary outside JSON.
229
  """
230
  model = genai.GenerativeModel("gemini-1.5-flash")
231
  response = model.generate_content(prompt)
 
251
  norm=[]
252
  for sc in scenes:
253
  if not isinstance(sc,dict): continue
254
+ prompt_txt = sc.get("prompt") or ""
255
  norm.append({
256
  "subject": sc.get("subject",""),
257
  "action": sc.get("action",""),
 
266
  "subject":"main subject",
267
  "action":"subtle motion",
268
  "camera":"slow dolly in",
269
+ "lighting":"soft directional key light",
270
  "mood":"cinematic",
271
  "style":"filmic grain",
272
  "prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
 
274
  norm=norm[:scene_count]
275
  return {"narration": narration, "scenes": norm}
276
 
277
+ # ---------------- ElevenLabs Voice Handling ----------------
278
+ def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.5) -> List[Dict[str,str]]:
279
  if not eleven_client:
280
  return []
281
  voices=[]
 
293
  if not token:
294
  break
295
  time.sleep(delay)
296
+ log.info(f"Fetched {len(voices)} ElevenLabs voices.")
297
  return voices
298
 
299
  def tts_elevenlabs(text: str, voice_id: str, model_id: str,
300
  stability: float, similarity: float,
301
  style: float, speaker_boost: bool,
302
  streaming: bool, out_path: str) -> bool:
303
+ if not eleven_client:
304
+ log.warning("ElevenLabs client not initialized.")
305
+ return False
306
+ if not voice_id:
307
+ log.warning("No voice_id provided for TTS.")
308
  return False
309
  try:
 
310
  stability=max(0,min(1,stability))
311
  similarity=max(0,min(1,similarity))
312
  style=max(0,min(1,style))
 
335
  )
336
  with open(out_path,"wb") as f:
337
  f.write(audio)
338
+ # sanity size check
339
+ if os.path.getsize(out_path) < 800:
340
+ log.error("ElevenLabs audio too small; treating as failure.")
341
+ return False
342
  return True
343
  except ApiError as e:
344
  log.error(f"ElevenLabs ApiError: {e}")
 
346
  log.error(f"ElevenLabs TTS error: {e}")
347
  return False
348
 
349
+ # ---------------- Runway Audio Fallback (placeholder silent track) ----------------
350
+ def runway_audio_fallback(text: str, out_path: str) -> bool:
 
 
 
 
351
  if not RUNWAY_AUDIO_FALLBACK:
352
  return False
353
  try:
 
 
354
  duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
355
  subprocess.run([
356
  "ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
 
362
  log.error(f"Runway audio fallback failed: {e}")
363
  return False
364
 
 
365
  def silent_track(narration: str, out_path: str):
366
  duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
367
  subprocess.run([
 
380
  prompt_text=text_prompt,
381
  duration=duration,
382
  ratio=ratio
383
+ ) # API pattern for gen4 / turbo image-to-video
384
  except Exception as e:
385
  raise gr.Error(f"Runway task creation failed: {e}")
386
 
 
409
 
410
  # ---------------- Sharpness Heuristic ----------------
411
  def clip_edge_density(path: str) -> float:
412
+ # Quick heuristic using FFmpeg + PIL (avoid heavy deps if opencv absent)
413
  try:
414
+ tmp = f"frame_{uid()}.png"
415
+ subprocess.run([
416
+ "ffmpeg","-i",path,"-vf","scale=320:-1","-vframes","1",tmp,"-y"
417
+ ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
418
+ img = Image.open(tmp).convert("L")
419
+ arr = np.array(img.filter(ImageFilter.FIND_EDGES))
420
+ val = arr.mean()/255.0
421
+ os.remove(tmp)
422
+ return val
 
 
 
 
423
  except Exception:
424
+ return 1.0 # assume acceptable if analysis fails
 
 
 
 
 
 
 
 
 
 
 
 
425
 
426
  # ---------------- Concatenate & Mux ----------------
427
  def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
 
442
  try: os.remove(p)
443
  except OSError: pass
444
 
445
+ # ---------------- Prompt Assembly ----------------
 
 
446
  def build_scene_prompt(sc: Dict[str,str]) -> str:
447
+ merged = sc.get("prompt") or ""
448
+ if merged:
449
+ return f"{merged}. {GLOBAL_STYLE}"
450
  base = f"{sc['subject']} {sc['action']}, {sc['camera']}, {sc['lighting']}, {sc['mood']}, {sc['style']}"
451
+ return f"{base}. {GLOBAL_STYLE}"
 
452
 
453
  # ---------------- Main Pipeline ----------------
454
  def generate_video(
455
  topic: str,
456
+ keyframes: list,
457
  scene_count: int,
458
  clip_duration: int,
459
  ratio: str,
 
476
  scene_count = max(1,min(MAX_SCENES,scene_count))
477
  if clip_duration not in ALLOWED_DURATIONS:
478
  clip_duration=5
479
+ runway_model = "gen4" if quality_mode else "gen4_turbo" # trade speed vs fidelity
 
480
 
481
  progress(0.05, desc="🔍 Researching...")
482
  facts = research_topic(topic)
 
490
  audio_path=f"narration_{job}.mp3"
491
  temp_files.append(audio_path)
492
 
493
+ # Determine voice id (UI or default fallback)
494
  if voice_choice and "|" in voice_choice:
495
+ voice_id = voice_choice.split("|",1)[1].strip()
496
+ else:
497
+ voice_id = DEFAULT_ELEVEN_VOICE_ID
498
+ log.info(f"[JOB {job}] Using voice_id='{voice_id}' model_id='{model_id}' (quality={quality_mode})")
499
 
500
  tts_ok=False
501
  if ELEVEN_KEY and voice_id:
 
505
  streaming_tts, audio_path
506
  )
507
  if not tts_ok and RUNWAY_AUDIO_FALLBACK:
508
+ tts_ok = runway_audio_fallback(narration, audio_path)
509
  if not tts_ok:
510
  silent_track(narration, audio_path)
511
 
512
  progress(0.40, desc="🖼️ Preparing keyframes...")
 
513
  loaded_keyframes=[]
514
  if keyframes:
515
+ for fp in keyframes[:4]:
516
  try:
517
  img=Image.open(fp).convert("RGB")
518
  loaded_keyframes.append(img)
 
523
  temp_files.append(placeholder)
524
  loaded_keyframes=[Image.open(placeholder).convert("RGB")]
525
 
 
526
  if ratio not in SUPPORTED_RATIOS:
527
  ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
528
  else:
529
  ratio_choice = ratio
530
+
531
+ processed=[]
532
  for img in loaded_keyframes:
533
+ processed.append(crop_to_ratio(img, ratio_choice))
 
534
 
535
+ # Data URIs for Runway image_to_video
536
  data_uris=[]
537
+ from io import BytesIO
538
+ for img in processed:
 
539
  buf=BytesIO()
540
  img.save(buf, format="PNG")
541
+ data_uris.append("data:image/png;base64,"+base64.b64encode(buf.getvalue()).decode("utf-8"))
 
542
 
543
  video_clips=[]
544
  for idx, sc in enumerate(scene_objs, start=1):
545
  progress(0.40 + 0.45*idx/scene_count,
546
  desc=f"🎬 Scene {idx}/{scene_count}...")
547
+ img_uri = data_uris[(idx-1)%len(data_uris)]
548
  prompt_text = build_scene_prompt(sc)
549
  clip_path = runway_generate_clip(
550
  model=runway_model,
 
555
  )
556
  video_clips.append(clip_path); temp_files.append(clip_path)
557
 
 
558
  sharp = clip_edge_density(clip_path)
559
  if sharp < SHARPNESS_MIN:
560
  log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
 
600
 
601
  # ---------------- Gradio Interface ----------------
602
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
603
+ gr.Markdown("# 🎬 AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs)")
604
  gr.Markdown(
605
+ "Iterate with Turbo, finalize with Gen-4. Upload up to 4 keyframes for stronger subject consistency."
 
606
  )
607
 
608
  with gr.Row():
 
613
  scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
614
  clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
615
  ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
616
+ quality_mode = gr.Checkbox(label="Quality Mode (gen4 vs gen4_turbo)", value=False)
617
 
618
+ gr.Markdown("### Narration (ElevenLabs primary; fallback silent track)")
619
  with gr.Row():
620
  refresh_btn = gr.Button("🔄 Refresh Voices")
621
  voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
 
649
 
650
  gr.Markdown(
651
  "### Tips\n"
652
+ "- Use detailed keyframes with clear subject & lighting.\n"
653
+ "- Add emotional descriptors directly in narration text for richer prosody.\n"
654
+ "- Iterate with Turbo then switch to Quality Mode to finalize.\n"
655
+ "- Adjust Stability/Similarity for expressiveness vs consistency."
656
  )
657
 
658
  if __name__ == '__main__':