Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,14 +1,25 @@
|
|
1 |
"""
|
2 |
AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
|
|
|
3 |
Features:
|
4 |
-
- Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration).
|
5 |
-
- Structured
|
6 |
-
- Multi-keyframe support (
|
7 |
-
-
|
8 |
-
-
|
9 |
-
- Runway
|
10 |
-
-
|
11 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
"""
|
13 |
|
14 |
import os
|
@@ -18,24 +29,24 @@ import random
|
|
18 |
import logging
|
19 |
import subprocess
|
20 |
import base64
|
21 |
-
import math
|
22 |
from pathlib import Path
|
23 |
-
from typing import List, Dict, Any, Optional
|
24 |
|
25 |
import gradio as gr
|
26 |
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
27 |
import numpy as np
|
28 |
|
|
|
29 |
import google.generativeai as genai
|
30 |
from tavily import TavilyClient
|
31 |
from runwayml import RunwayML
|
32 |
import httpx
|
33 |
|
34 |
-
# ---- ElevenLabs (version-agnostic
|
35 |
try:
|
36 |
from elevenlabs import ElevenLabs
|
37 |
try:
|
38 |
-
from elevenlabs.errors import ApiError # may
|
39 |
except Exception:
|
40 |
ApiError = Exception
|
41 |
except ImportError:
|
@@ -55,15 +66,14 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
|
55 |
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
56 |
RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
|
57 |
ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
|
58 |
-
RUNWAY_AUDIO_FALLBACK = True # toggle fallback usage
|
59 |
|
60 |
-
|
61 |
"GEMINI_API_KEY": GEMINI_API_KEY,
|
62 |
"TAVILY_API_KEY": TAVILY_API_KEY,
|
63 |
"RUNWAY_API_KEY": RUNWAY_KEY
|
64 |
}.items() if not v]
|
65 |
-
if
|
66 |
-
raise RuntimeError(f"Missing required API keys: {', '.join(
|
67 |
|
68 |
genai.configure(api_key=GEMINI_API_KEY)
|
69 |
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
|
@@ -73,8 +83,8 @@ eleven_client = ElevenLabs(api_key=ELEVEN_KEY) if (ELEVEN_KEY and ElevenLabs) el
|
|
73 |
# ---------------- Constants ----------------
|
74 |
DEFAULT_SCENES = 4
|
75 |
MAX_SCENES = 8
|
76 |
-
ALLOWED_DURATIONS = {5, 10}
|
77 |
-
SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"}
|
78 |
WORDS_PER_SEC = 2.5
|
79 |
PLACEHOLDER_BG = (16, 18, 24)
|
80 |
PLACEHOLDER_FG = (240, 242, 248)
|
@@ -82,8 +92,13 @@ FONT_CANDIDATES = [
|
|
82 |
"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
|
83 |
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
|
84 |
]
|
85 |
-
SHARPNESS_MIN = 0.015
|
86 |
RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
# ---------------- Utility ----------------
|
89 |
def uid() -> str:
|
@@ -93,7 +108,7 @@ def sanitize_filename(name: str) -> str:
|
|
93 |
safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
|
94 |
return safe or "video"
|
95 |
|
96 |
-
def load_font(size: int =
|
97 |
for p in FONT_CANDIDATES:
|
98 |
if Path(p).exists():
|
99 |
try:
|
@@ -116,12 +131,13 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
|
|
116 |
else:
|
117 |
line.append(w)
|
118 |
if line: lines.append(" ".join(line))
|
119 |
-
|
120 |
-
metrics=[]
|
121 |
for ln in lines:
|
122 |
bbox = draw.textbbox((0,0), ln, font=font)
|
123 |
h=bbox[3]-bbox[1]
|
124 |
-
metrics.append((ln,h,bbox))
|
|
|
125 |
y=(height-total_h)//2
|
126 |
for ln,h,bbox in metrics:
|
127 |
w=bbox[2]-bbox[0]
|
@@ -132,36 +148,28 @@ def generate_placeholder_image(topic: str, width=768, height=432) -> str:
|
|
132 |
img.save(out)
|
133 |
return out
|
134 |
|
135 |
-
def aspect_ratio_of(img: Image.Image) -> str:
|
136 |
-
w,h=img.size
|
137 |
-
return f"{w}:{h}"
|
138 |
-
|
139 |
def closest_supported_ratio(w: int, h: int) -> str:
|
140 |
-
# choose ratio minimizing relative area crop after scaling
|
141 |
candidates=[]
|
|
|
142 |
for r in SUPPORTED_RATIOS:
|
143 |
-
rw,rh = map(int,
|
144 |
-
|
145 |
-
cur_ratio = w / h
|
146 |
-
diff = abs(target_ratio - cur_ratio)
|
147 |
candidates.append((diff,r))
|
148 |
candidates.sort()
|
149 |
return candidates[0][1]
|
150 |
|
151 |
def crop_to_ratio(img: Image.Image, ratio: str) -> Image.Image:
|
152 |
-
rw,rh=map(int,ratio.split(":"))
|
153 |
-
target=rw/rh
|
154 |
-
w,h=img.size
|
155 |
-
cur=w/h
|
156 |
-
if abs(cur-target)
|
157 |
return img
|
158 |
-
if cur>target:
|
159 |
-
# too wide
|
160 |
new_w=int(target*h)
|
161 |
x0=(w-new_w)//2
|
162 |
return img.crop((x0,0,x0+new_w,h))
|
163 |
-
else:
|
164 |
-
# too tall
|
165 |
new_h=int(w/target)
|
166 |
y0=(h-new_h)//2
|
167 |
return img.crop((0,y0,w,y0+new_h))
|
@@ -183,6 +191,9 @@ def research_topic(topic: str) -> str:
|
|
183 |
|
184 |
# ---------------- Gemini Script Generation ----------------
|
185 |
def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
|
|
|
|
|
|
|
186 |
prompt = f"""
|
187 |
You are a creative director.
|
188 |
|
@@ -202,19 +213,19 @@ Return STRICT JSON:
|
|
202 |
"lighting": "...",
|
203 |
"mood": "...",
|
204 |
"style": "...",
|
205 |
-
"prompt": "<
|
206 |
}}
|
207 |
-
(exactly {scene_count} objects
|
208 |
]
|
209 |
}}
|
210 |
|
211 |
Rules:
|
212 |
-
- subject
|
213 |
-
- camera
|
214 |
-
- lighting (e.g. "golden hour rim light"
|
215 |
-
-
|
216 |
-
-
|
217 |
-
-
|
218 |
"""
|
219 |
model = genai.GenerativeModel("gemini-1.5-flash")
|
220 |
response = model.generate_content(prompt)
|
@@ -240,7 +251,7 @@ Rules:
|
|
240 |
norm=[]
|
241 |
for sc in scenes:
|
242 |
if not isinstance(sc,dict): continue
|
243 |
-
prompt_txt = sc.get("prompt") or "
|
244 |
norm.append({
|
245 |
"subject": sc.get("subject",""),
|
246 |
"action": sc.get("action",""),
|
@@ -255,7 +266,7 @@ Rules:
|
|
255 |
"subject":"main subject",
|
256 |
"action":"subtle motion",
|
257 |
"camera":"slow dolly in",
|
258 |
-
"lighting":"soft directional light",
|
259 |
"mood":"cinematic",
|
260 |
"style":"filmic grain",
|
261 |
"prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
|
@@ -263,8 +274,8 @@ Rules:
|
|
263 |
norm=norm[:scene_count]
|
264 |
return {"narration": narration, "scenes": norm}
|
265 |
|
266 |
-
# ---------------- ElevenLabs ----------------
|
267 |
-
def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.
|
268 |
if not eleven_client:
|
269 |
return []
|
270 |
voices=[]
|
@@ -282,16 +293,20 @@ def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.6) -> List[Dict[st
|
|
282 |
if not token:
|
283 |
break
|
284 |
time.sleep(delay)
|
|
|
285 |
return voices
|
286 |
|
287 |
def tts_elevenlabs(text: str, voice_id: str, model_id: str,
|
288 |
stability: float, similarity: float,
|
289 |
style: float, speaker_boost: bool,
|
290 |
streaming: bool, out_path: str) -> bool:
|
291 |
-
if not eleven_client
|
|
|
|
|
|
|
|
|
292 |
return False
|
293 |
try:
|
294 |
-
# clamp
|
295 |
stability=max(0,min(1,stability))
|
296 |
similarity=max(0,min(1,similarity))
|
297 |
style=max(0,min(1,style))
|
@@ -320,6 +335,10 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
|
|
320 |
)
|
321 |
with open(out_path,"wb") as f:
|
322 |
f.write(audio)
|
|
|
|
|
|
|
|
|
323 |
return True
|
324 |
except ApiError as e:
|
325 |
log.error(f"ElevenLabs ApiError: {e}")
|
@@ -327,17 +346,11 @@ def tts_elevenlabs(text: str, voice_id: str, model_id: str,
|
|
327 |
log.error(f"ElevenLabs TTS error: {e}")
|
328 |
return False
|
329 |
|
330 |
-
# ---------------- Runway Audio Fallback ----------------
|
331 |
-
def
|
332 |
-
"""
|
333 |
-
Simple fallback using Runway Generative Audio (pseudo-endpoint placeholder).
|
334 |
-
NOTE: Replace with official SDK call if/when available in your Python client.
|
335 |
-
"""
|
336 |
if not RUNWAY_AUDIO_FALLBACK:
|
337 |
return False
|
338 |
try:
|
339 |
-
# Placeholder logic: here we just synthesize silence to keep pipeline moving.
|
340 |
-
# (Integrate actual Runway audio generation when SDK exposes it.)
|
341 |
duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
|
342 |
subprocess.run([
|
343 |
"ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
|
@@ -349,7 +362,6 @@ def runway_generate_audio(text: str, out_path: str) -> bool:
|
|
349 |
log.error(f"Runway audio fallback failed: {e}")
|
350 |
return False
|
351 |
|
352 |
-
# ---------------- Mock / Silent Fallback ----------------
|
353 |
def silent_track(narration: str, out_path: str):
|
354 |
duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
|
355 |
subprocess.run([
|
@@ -368,7 +380,7 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
|
|
368 |
prompt_text=text_prompt,
|
369 |
duration=duration,
|
370 |
ratio=ratio
|
371 |
-
)
|
372 |
except Exception as e:
|
373 |
raise gr.Error(f"Runway task creation failed: {e}")
|
374 |
|
@@ -397,34 +409,19 @@ def runway_generate_clip(model: str, prompt_image: str, text_prompt: str,
|
|
397 |
|
398 |
# ---------------- Sharpness Heuristic ----------------
|
399 |
def clip_edge_density(path: str) -> float:
|
|
|
400 |
try:
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
acc += edges.mean()/255.0
|
411 |
-
frames+=1
|
412 |
-
cap.release()
|
413 |
-
return acc/max(frames,1)
|
414 |
except Exception:
|
415 |
-
#
|
416 |
-
try:
|
417 |
-
# extract a frame via ffmpeg
|
418 |
-
tmp = f"frame_{uid()}.png"
|
419 |
-
subprocess.run(["ffmpeg","-i",path,"-vf","scale=320:-1","-vframes","1",tmp,"-y"],
|
420 |
-
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
|
421 |
-
img = Image.open(tmp).convert("L")
|
422 |
-
arr = np.array(img.filter(ImageFilter.FIND_EDGES))
|
423 |
-
val = arr.mean()/255.0
|
424 |
-
os.remove(tmp)
|
425 |
-
return val
|
426 |
-
except Exception:
|
427 |
-
return 1.0 # assume ok if cannot measure
|
428 |
|
429 |
# ---------------- Concatenate & Mux ----------------
|
430 |
def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
|
@@ -445,18 +442,18 @@ def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
|
|
445 |
try: os.remove(p)
|
446 |
except OSError: pass
|
447 |
|
448 |
-
# ----------------
|
449 |
-
GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle motion, high detail"
|
450 |
-
|
451 |
def build_scene_prompt(sc: Dict[str,str]) -> str:
|
|
|
|
|
|
|
452 |
base = f"{sc['subject']} {sc['action']}, {sc['camera']}, {sc['lighting']}, {sc['mood']}, {sc['style']}"
|
453 |
-
|
454 |
-
return f"{merged}. {GLOBAL_STYLE}"
|
455 |
|
456 |
# ---------------- Main Pipeline ----------------
|
457 |
def generate_video(
|
458 |
topic: str,
|
459 |
-
keyframes: list,
|
460 |
scene_count: int,
|
461 |
clip_duration: int,
|
462 |
ratio: str,
|
@@ -479,8 +476,7 @@ def generate_video(
|
|
479 |
scene_count = max(1,min(MAX_SCENES,scene_count))
|
480 |
if clip_duration not in ALLOWED_DURATIONS:
|
481 |
clip_duration=5
|
482 |
-
#
|
483 |
-
runway_model = "gen4" if quality_mode else "gen4_turbo"
|
484 |
|
485 |
progress(0.05, desc="π Researching...")
|
486 |
facts = research_topic(topic)
|
@@ -494,9 +490,12 @@ def generate_video(
|
|
494 |
audio_path=f"narration_{job}.mp3"
|
495 |
temp_files.append(audio_path)
|
496 |
|
497 |
-
|
498 |
if voice_choice and "|" in voice_choice:
|
499 |
-
voice_id = voice_choice.split("|",1)[1]
|
|
|
|
|
|
|
500 |
|
501 |
tts_ok=False
|
502 |
if ELEVEN_KEY and voice_id:
|
@@ -506,15 +505,14 @@ def generate_video(
|
|
506 |
streaming_tts, audio_path
|
507 |
)
|
508 |
if not tts_ok and RUNWAY_AUDIO_FALLBACK:
|
509 |
-
tts_ok =
|
510 |
if not tts_ok:
|
511 |
silent_track(narration, audio_path)
|
512 |
|
513 |
progress(0.40, desc="πΌοΈ Preparing keyframes...")
|
514 |
-
# Handle multi-keyframe: if multiple, cycle through them; else create placeholder
|
515 |
loaded_keyframes=[]
|
516 |
if keyframes:
|
517 |
-
for fp in keyframes:
|
518 |
try:
|
519 |
img=Image.open(fp).convert("RGB")
|
520 |
loaded_keyframes.append(img)
|
@@ -525,31 +523,28 @@ def generate_video(
|
|
525 |
temp_files.append(placeholder)
|
526 |
loaded_keyframes=[Image.open(placeholder).convert("RGB")]
|
527 |
|
528 |
-
# Ratio handling
|
529 |
if ratio not in SUPPORTED_RATIOS:
|
530 |
ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
|
531 |
else:
|
532 |
ratio_choice = ratio
|
533 |
-
|
|
|
534 |
for img in loaded_keyframes:
|
535 |
-
|
536 |
-
processed_images.append(proc)
|
537 |
|
538 |
-
#
|
539 |
data_uris=[]
|
540 |
-
|
541 |
-
|
542 |
-
from io import BytesIO
|
543 |
buf=BytesIO()
|
544 |
img.save(buf, format="PNG")
|
545 |
-
|
546 |
-
data_uris.append("data:image/png;base64,"+base64.b64encode(b).decode("utf-8"))
|
547 |
|
548 |
video_clips=[]
|
549 |
for idx, sc in enumerate(scene_objs, start=1):
|
550 |
progress(0.40 + 0.45*idx/scene_count,
|
551 |
desc=f"π¬ Scene {idx}/{scene_count}...")
|
552 |
-
img_uri = data_uris[(idx-1)
|
553 |
prompt_text = build_scene_prompt(sc)
|
554 |
clip_path = runway_generate_clip(
|
555 |
model=runway_model,
|
@@ -560,7 +555,6 @@ def generate_video(
|
|
560 |
)
|
561 |
video_clips.append(clip_path); temp_files.append(clip_path)
|
562 |
|
563 |
-
# Sharpness check
|
564 |
sharp = clip_edge_density(clip_path)
|
565 |
if sharp < SHARPNESS_MIN:
|
566 |
log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
|
@@ -606,10 +600,9 @@ def refresh_voices():
|
|
606 |
|
607 |
# ---------------- Gradio Interface ----------------
|
608 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
609 |
-
gr.Markdown("# π¬ AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs
|
610 |
gr.Markdown(
|
611 |
-
"Iterate
|
612 |
-
"Upload multiple keyframes to improve subject consistency."
|
613 |
)
|
614 |
|
615 |
with gr.Row():
|
@@ -620,9 +613,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
620 |
scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
|
621 |
clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
|
622 |
ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
|
623 |
-
quality_mode = gr.Checkbox(label="Quality Mode (
|
624 |
|
625 |
-
gr.Markdown("### Narration (
|
626 |
with gr.Row():
|
627 |
refresh_btn = gr.Button("π Refresh Voices")
|
628 |
voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
|
@@ -656,10 +649,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
656 |
|
657 |
gr.Markdown(
|
658 |
"### Tips\n"
|
659 |
-
"- Use
|
660 |
-
"-
|
661 |
-
"-
|
662 |
-
"-
|
663 |
)
|
664 |
|
665 |
if __name__ == '__main__':
|
|
|
1 |
"""
|
2 |
AI Video Studio (Runway Gen-4 / Gen-4 Turbo + Gemini + Tavily + ElevenLabs + Runway Audio Fallback)
|
3 |
+
|
4 |
Features:
|
5 |
+
- Quality Mode: choose 'gen4' (higher fidelity) or 'gen4_turbo' (faster iteration). Gen-4 / Turbo accept 5s or 10s durations only.
|
6 |
+
- Structured scene schema (Subject | Action | Camera | Lighting | Mood | Style) -> merged prompt.
|
7 |
+
- Multi-keyframe support (upload 1–4 images); automatic ratio cropping to supported Runway aspect ratios.
|
8 |
+
- ElevenLabs TTS with: pagination, retry, streaming/non-streaming, adjustable stability/similarity/style/speaker boost.
|
9 |
+
- Hard fallback default voice ID (env ELEVEN_DEFAULT_VOICE_ID) if dropdown fetch fails.
|
10 |
+
- Runway audio silent fallback placeholder (stub) if all TTS fails (replace later with real Runway audio call if available).
|
11 |
+
- Sharpness (edge density) heuristic; one automatic re-generation with detail suffix for blurry clips.
|
12 |
+
- Clean temporary file housekeeping; robust logging & progress reporting.
|
13 |
+
|
14 |
+
Environment Variables (required):
|
15 |
+
GEMINI_API_KEY
|
16 |
+
TAVILY_API_KEY
|
17 |
+
RUNWAY_API_KEY (or RUNWAYML_API_SECRET)
|
18 |
+
Optional:
|
19 |
+
ELEVENLABS_API_KEY (or XI_API_KEY)
|
20 |
+
ELEVEN_DEFAULT_VOICE_ID (fallback voice id)
|
21 |
+
|
22 |
+
Security: NEVER hard-code real API keys in this file.
|
23 |
"""
|
24 |
|
25 |
import os
|
|
|
29 |
import logging
|
30 |
import subprocess
|
31 |
import base64
|
|
|
32 |
from pathlib import Path
|
33 |
+
from typing import List, Dict, Any, Optional
|
34 |
|
35 |
import gradio as gr
|
36 |
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
37 |
import numpy as np
|
38 |
|
39 |
+
# External SDKs
|
40 |
import google.generativeai as genai
|
41 |
from tavily import TavilyClient
|
42 |
from runwayml import RunwayML
|
43 |
import httpx
|
44 |
|
45 |
+
# ---- ElevenLabs (version-agnostic import) ----
|
46 |
try:
|
47 |
from elevenlabs import ElevenLabs
|
48 |
try:
|
49 |
+
from elevenlabs.errors import ApiError # may not exist in some versions
|
50 |
except Exception:
|
51 |
ApiError = Exception
|
52 |
except ImportError:
|
|
|
66 |
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
67 |
RUNWAY_KEY = os.getenv("RUNWAY_API_KEY") or os.getenv("RUNWAYML_API_SECRET")
|
68 |
ELEVEN_KEY = os.getenv("ELEVENLABS_API_KEY") or os.getenv("XI_API_KEY")
|
|
|
69 |
|
70 |
+
required_missing = [k for k, v in {
|
71 |
"GEMINI_API_KEY": GEMINI_API_KEY,
|
72 |
"TAVILY_API_KEY": TAVILY_API_KEY,
|
73 |
"RUNWAY_API_KEY": RUNWAY_KEY
|
74 |
}.items() if not v]
|
75 |
+
if required_missing:
|
76 |
+
raise RuntimeError(f"Missing required API keys: {', '.join(required_missing)}")
|
77 |
|
78 |
genai.configure(api_key=GEMINI_API_KEY)
|
79 |
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
|
|
|
83 |
# ---------------- Constants ----------------
|
84 |
DEFAULT_SCENES = 4
|
85 |
MAX_SCENES = 8
|
86 |
+
ALLOWED_DURATIONS = {5, 10} # Clip lengths in seconds offered for Runway Gen-4 / Turbo (5 or 10)
|
87 |
+
SUPPORTED_RATIOS = {"1280:720", "1584:672", "1104:832", "720:1280", "832:1104", "960:960"} # "W:H" output ratios offered in the UI and used for keyframe cropping
|
88 |
WORDS_PER_SEC = 2.5
|
89 |
PLACEHOLDER_BG = (16, 18, 24)
|
90 |
PLACEHOLDER_FG = (240, 242, 248)
|
|
|
92 |
"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
|
93 |
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
|
94 |
]
|
95 |
+
SHARPNESS_MIN = 0.015
|
96 |
RETRY_DETAIL_SUFFIX = "ultra-detailed textures, crisp focus, refined edges"
|
97 |
+
GLOBAL_STYLE = "cinematic, cohesive composition, natural volumetric light, filmic color grade, gentle camera motion, high detail"
|
98 |
+
|
99 |
+
# Fallback ElevenLabs voice ID (replace with your own or set env var)
|
100 |
+
DEFAULT_ELEVEN_VOICE_ID = os.getenv("ELEVEN_DEFAULT_VOICE_ID", "21m00Tcm4TlvDq8ikWAM") # example/published sample id
|
101 |
+
RUNWAY_AUDIO_FALLBACK = True # Placeholder stub (replace with real Runway audio generation when available)
|
102 |
|
103 |
# ---------------- Utility ----------------
|
104 |
def uid() -> str:
|
|
|
108 |
safe = "".join(c for c in name if c.isalnum() or c in ("-","_"))[:60]
|
109 |
return safe or "video"
|
110 |
|
111 |
+
def load_font(size: int = 44):
|
112 |
for p in FONT_CANDIDATES:
|
113 |
if Path(p).exists():
|
114 |
try:
|
|
|
131 |
else:
|
132 |
line.append(w)
|
133 |
if line: lines.append(" ".join(line))
|
134 |
+
# center vertically
|
135 |
+
metrics=[]; total_h=0
|
136 |
for ln in lines:
|
137 |
bbox = draw.textbbox((0,0), ln, font=font)
|
138 |
h=bbox[3]-bbox[1]
|
139 |
+
metrics.append((ln,h,bbox))
|
140 |
+
total_h += h+12
|
141 |
y=(height-total_h)//2
|
142 |
for ln,h,bbox in metrics:
|
143 |
w=bbox[2]-bbox[0]
|
|
|
148 |
img.save(out)
|
149 |
return out
|
150 |
|
|
|
|
|
|
|
|
|
151 |
def closest_supported_ratio(w: int, h: int, supported=None) -> str:
    """Return the supported "W:H" ratio string closest to the w/h aspect ratio.

    Closeness is the absolute difference between ``w / h`` and each
    candidate's ``W / H``. Ties are broken by the candidate string's
    lexicographic order, so the result is deterministic even when the
    candidates come from a set.

    Args:
        w: Source width in pixels.
        h: Source height in pixels; must be positive.
        supported: Optional iterable of "W:H" strings to choose from.
            Defaults to the module-level ``SUPPORTED_RATIOS``.

    Returns:
        The best-matching ratio string, exactly as it appears in the candidates.

    Raises:
        ValueError: If ``h`` is not positive, no candidates are given, or a
            candidate is not a valid "W:H" string with a non-zero height.
    """
    if h <= 0:
        raise ValueError(f"Image height must be positive, got {h}")
    # Resolve the default lazily so the constant is only needed when no
    # explicit candidate list is passed.
    options = SUPPORTED_RATIOS if supported is None else supported
    cur_ratio = w / h

    def distance(ratio: str):
        rw, rh = map(int, ratio.split(":"))
        if rh == 0:
            raise ValueError(f"Invalid aspect ratio {ratio!r}")
        # Same ordering key the sorted (diff, ratio) tuples used to give.
        return (abs(cur_ratio - rw / rh), ratio)

    try:
        return min(options, key=distance)
    except ValueError as exc:
        # min() raises a bare ValueError on an empty iterable; make it clearer.
        if not list(options):
            raise ValueError("No supported aspect ratios to choose from") from exc
        raise
|
160 |
|
161 |
def crop_to_ratio(img: "Image.Image", ratio: str) -> "Image.Image":
    """Center-crop ``img`` to the aspect ratio given as a "W:H" string.

    The image is returned unchanged when its aspect ratio is already within
    1e-3 of the target, or when it has a zero dimension. Otherwise the
    longer axis is trimmed symmetrically; the image is never scaled.

    Args:
        img: Object exposing ``size`` as ``(width, height)`` and
            ``crop((left, upper, right, lower))``.
        ratio: Target aspect ratio such as "1280:720".

    Returns:
        The original image or the result of ``img.crop(...)``.

    Raises:
        ValueError: If ``ratio`` is not two positive integers joined by ":".
    """
    rw, rh = map(int, ratio.split(":"))
    if rw <= 0 or rh <= 0:
        raise ValueError(f"Invalid aspect ratio {ratio!r}")

    w, h = img.size
    if w <= 0 or h <= 0:
        # Nothing sensible to crop; avoid dividing by zero below.
        return img
    if abs(w / h - rw / rh) < 1e-3:
        return img

    # Integer arithmetic avoids float truncation being off by one pixel,
    # and max(1, ...) keeps the crop box from collapsing to zero size.
    if w * rh > h * rw:  # too wide: trim left and right
        new_w = max(1, h * rw // rh)
        x0 = (w - new_w) // 2
        return img.crop((x0, 0, x0 + new_w, h))

    # too tall: trim top and bottom
    new_h = max(1, w * rh // rw)
    y0 = (h - new_h) // 2
    return img.crop((0, y0, w, y0 + new_h))
|
|
|
191 |
|
192 |
# ---------------- Gemini Script Generation ----------------
|
193 |
def gemini_script(topic: str, facts: str, scene_count: int) -> Dict[str,Any]:
|
194 |
+
"""
|
195 |
+
Request structured JSON with narration + scene objects containing schema fields.
|
196 |
+
"""
|
197 |
prompt = f"""
|
198 |
You are a creative director.
|
199 |
|
|
|
213 |
"lighting": "...",
|
214 |
"mood": "...",
|
215 |
"style": "...",
|
216 |
+
"prompt": "<merged scene prompt (<=40 words)>"
|
217 |
}}
|
218 |
+
(exactly {scene_count} objects)
|
219 |
]
|
220 |
}}
|
221 |
|
222 |
Rules:
|
223 |
+
- Keep one consistent main subject across scenes unless evolution is explicitly helpful.
|
224 |
+
- camera: ONE motion (e.g. "slow dolly in", "handheld pan", "aerial sweep").
|
225 |
+
- lighting: descriptive & cinematic (e.g. "golden hour rim light").
|
226 |
+
- style: filmic adjectives (e.g. "35mm film grain, rich color palette").
|
227 |
+
- merged prompt must integrate key fields succinctly.
|
228 |
+
- No markdown, no lists, no commentary outside JSON.
|
229 |
"""
|
230 |
model = genai.GenerativeModel("gemini-1.5-flash")
|
231 |
response = model.generate_content(prompt)
|
|
|
251 |
norm=[]
|
252 |
for sc in scenes:
|
253 |
if not isinstance(sc,dict): continue
|
254 |
+
prompt_txt = sc.get("prompt") or ""
|
255 |
norm.append({
|
256 |
"subject": sc.get("subject",""),
|
257 |
"action": sc.get("action",""),
|
|
|
266 |
"subject":"main subject",
|
267 |
"action":"subtle motion",
|
268 |
"camera":"slow dolly in",
|
269 |
+
"lighting":"soft directional key light",
|
270 |
"mood":"cinematic",
|
271 |
"style":"filmic grain",
|
272 |
"prompt":f"Cinematic slow dolly in of main subject, soft directional light, filmic grain, {topic}"
|
|
|
274 |
norm=norm[:scene_count]
|
275 |
return {"narration": narration, "scenes": norm}
|
276 |
|
277 |
+
# ---------------- ElevenLabs Voice Handling ----------------
|
278 |
+
def fetch_voices_paginated(max_pages=5, page_size=50, delay=0.5) -> List[Dict[str,str]]:
|
279 |
if not eleven_client:
|
280 |
return []
|
281 |
voices=[]
|
|
|
293 |
if not token:
|
294 |
break
|
295 |
time.sleep(delay)
|
296 |
+
log.info(f"Fetched {len(voices)} ElevenLabs voices.")
|
297 |
return voices
|
298 |
|
299 |
def tts_elevenlabs(text: str, voice_id: str, model_id: str,
|
300 |
stability: float, similarity: float,
|
301 |
style: float, speaker_boost: bool,
|
302 |
streaming: bool, out_path: str) -> bool:
|
303 |
+
if not eleven_client:
|
304 |
+
log.warning("ElevenLabs client not initialized.")
|
305 |
+
return False
|
306 |
+
if not voice_id:
|
307 |
+
log.warning("No voice_id provided for TTS.")
|
308 |
return False
|
309 |
try:
|
|
|
310 |
stability=max(0,min(1,stability))
|
311 |
similarity=max(0,min(1,similarity))
|
312 |
style=max(0,min(1,style))
|
|
|
335 |
)
|
336 |
with open(out_path,"wb") as f:
|
337 |
f.write(audio)
|
338 |
+
# sanity size check
|
339 |
+
if os.path.getsize(out_path) < 800:
|
340 |
+
log.error("ElevenLabs audio too small; treating as failure.")
|
341 |
+
return False
|
342 |
return True
|
343 |
except ApiError as e:
|
344 |
log.error(f"ElevenLabs ApiError: {e}")
|
|
|
346 |
log.error(f"ElevenLabs TTS error: {e}")
|
347 |
return False
|
348 |
|
349 |
+
# ---------------- Runway Audio Fallback (placeholder silent track) ----------------
|
350 |
+
def runway_audio_fallback(text: str, out_path: str) -> bool:
|
|
|
|
|
|
|
|
|
351 |
if not RUNWAY_AUDIO_FALLBACK:
|
352 |
return False
|
353 |
try:
|
|
|
|
|
354 |
duration = max(2.0, min(300.0, len(text.split())/WORDS_PER_SEC))
|
355 |
subprocess.run([
|
356 |
"ffmpeg","-f","lavfi","-i","anullsrc=r=44100:cl=mono",
|
|
|
362 |
log.error(f"Runway audio fallback failed: {e}")
|
363 |
return False
|
364 |
|
|
|
365 |
def silent_track(narration: str, out_path: str):
|
366 |
duration = max(2.0, min(300.0, len(narration.split())/WORDS_PER_SEC))
|
367 |
subprocess.run([
|
|
|
380 |
prompt_text=text_prompt,
|
381 |
duration=duration,
|
382 |
ratio=ratio
|
383 |
+
) # API pattern for gen4 / turbo image-to-video :contentReference[oaicite:3]{index=3}:contentReference[oaicite:4]{index=4}
|
384 |
except Exception as e:
|
385 |
raise gr.Error(f"Runway task creation failed: {e}")
|
386 |
|
|
|
409 |
|
410 |
# ---------------- Sharpness Heuristic ----------------
|
411 |
def clip_edge_density(path: str) -> float:
    """Estimate the sharpness of a video clip from the edge density of one frame.

    ffmpeg extracts a single frame scaled to 320 px wide. The frame is
    converted to grayscale, passed through PIL's ``FIND_EDGES`` filter, and
    the mean pixel value is normalised to the 0..1 range.

    Args:
        path: Path to the video file to analyse.

    Returns:
        The mean edge intensity in [0, 1], or 1.0 when the clip cannot be
        analysed, so that a measurement failure is treated as acceptable
        rather than as a blurry clip.
    """
    tmp = f"frame_{uid()}.png"
    try:
        # "-y" is a global option and must precede the output file; placed
        # after it, ffmpeg reports it as a trailing option.
        subprocess.run(
            ["ffmpeg", "-y", "-i", path, "-vf", "scale=320:-1", "-vframes", "1", tmp],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
        # Context manager releases the file handle before the frame is removed.
        with Image.open(tmp) as frame:
            edges = np.array(frame.convert("L").filter(ImageFilter.FIND_EDGES))
        return float(edges.mean() / 255.0)
    except Exception as exc:
        # Best-effort heuristic: record why it failed, then assume acceptable.
        log.warning(f"Sharpness analysis failed for {path}: {exc}")
        return 1.0
    finally:
        # Always remove the extracted frame, including on failure paths.
        try:
            os.remove(tmp)
        except OSError:
            pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
425 |
|
426 |
# ---------------- Concatenate & Mux ----------------
|
427 |
def concat_and_mux(video_paths: List[str], audio_path: str, out_path: str):
|
|
|
442 |
try: os.remove(p)
|
443 |
except OSError: pass
|
444 |
|
445 |
+
# ---------------- Prompt Assembly ----------------
|
|
|
|
|
446 |
def build_scene_prompt(sc: Dict[str, str], style_suffix: Optional[str] = None) -> str:
    """Build the text prompt for one scene and append the shared style suffix.

    The scene's pre-merged ``"prompt"`` field is used when it is non-blank.
    Otherwise the prompt is assembled as
    ``"<subject> <action>, <camera>, <lighting>, <mood>, <style>"``, skipping
    any field that is missing or blank so that no empty ", ," runs appear.
    Trailing spaces and periods are stripped before the suffix is joined
    with ". ", which avoids a doubled period.

    Args:
        sc: Scene dictionary; every key is optional.
        style_suffix: Style text appended to the prompt. Defaults to the
            module-level ``GLOBAL_STYLE``.

    Returns:
        ``"<scene text>. <style suffix>"``, or just the suffix when the scene
        carries no usable text.
    """
    suffix = GLOBAL_STYLE if style_suffix is None else style_suffix

    def field(key: str) -> str:
        # Treat missing keys, None and whitespace-only values alike.
        return str(sc.get(key) or "").strip()

    text = field("prompt")
    if not text:
        lead = " ".join(p for p in (field("subject"), field("action")) if p)
        parts = [lead] + [field(k) for k in ("camera", "lighting", "mood", "style")]
        text = ", ".join(p for p in parts if p)

    text = text.rstrip(" .")
    return f"{text}. {suffix}" if text else suffix
|
|
|
452 |
|
453 |
# ---------------- Main Pipeline ----------------
|
454 |
def generate_video(
|
455 |
topic: str,
|
456 |
+
keyframes: list,
|
457 |
scene_count: int,
|
458 |
clip_duration: int,
|
459 |
ratio: str,
|
|
|
476 |
scene_count = max(1,min(MAX_SCENES,scene_count))
|
477 |
if clip_duration not in ALLOWED_DURATIONS:
|
478 |
clip_duration=5
|
479 |
+
runway_model = "gen4" if quality_mode else "gen4_turbo" # trade speed vs fidelity :contentReference[oaicite:5]{index=5}:contentReference[oaicite:6]{index=6}
|
|
|
480 |
|
481 |
progress(0.05, desc="π Researching...")
|
482 |
facts = research_topic(topic)
|
|
|
490 |
audio_path=f"narration_{job}.mp3"
|
491 |
temp_files.append(audio_path)
|
492 |
|
493 |
+
# Determine voice id (UI or default fallback)
|
494 |
if voice_choice and "|" in voice_choice:
|
495 |
+
voice_id = voice_choice.split("|",1)[1].strip()
|
496 |
+
else:
|
497 |
+
voice_id = DEFAULT_ELEVEN_VOICE_ID
|
498 |
+
log.info(f"[JOB {job}] Using voice_id='{voice_id}' model_id='{model_id}' (quality={quality_mode})")
|
499 |
|
500 |
tts_ok=False
|
501 |
if ELEVEN_KEY and voice_id:
|
|
|
505 |
streaming_tts, audio_path
|
506 |
)
|
507 |
if not tts_ok and RUNWAY_AUDIO_FALLBACK:
|
508 |
+
tts_ok = runway_audio_fallback(narration, audio_path)
|
509 |
if not tts_ok:
|
510 |
silent_track(narration, audio_path)
|
511 |
|
512 |
progress(0.40, desc="πΌοΈ Preparing keyframes...")
|
|
|
513 |
loaded_keyframes=[]
|
514 |
if keyframes:
|
515 |
+
for fp in keyframes[:4]:
|
516 |
try:
|
517 |
img=Image.open(fp).convert("RGB")
|
518 |
loaded_keyframes.append(img)
|
|
|
523 |
temp_files.append(placeholder)
|
524 |
loaded_keyframes=[Image.open(placeholder).convert("RGB")]
|
525 |
|
|
|
526 |
if ratio not in SUPPORTED_RATIOS:
|
527 |
ratio_choice = closest_supported_ratio(*loaded_keyframes[0].size)
|
528 |
else:
|
529 |
ratio_choice = ratio
|
530 |
+
|
531 |
+
processed=[]
|
532 |
for img in loaded_keyframes:
|
533 |
+
processed.append(crop_to_ratio(img, ratio_choice))
|
|
|
534 |
|
535 |
+
# Data URIs for Runway image_to_video
|
536 |
data_uris=[]
|
537 |
+
from io import BytesIO
|
538 |
+
for img in processed:
|
|
|
539 |
buf=BytesIO()
|
540 |
img.save(buf, format="PNG")
|
541 |
+
data_uris.append("data:image/png;base64,"+base64.b64encode(buf.getvalue()).decode("utf-8"))
|
|
|
542 |
|
543 |
video_clips=[]
|
544 |
for idx, sc in enumerate(scene_objs, start=1):
|
545 |
progress(0.40 + 0.45*idx/scene_count,
|
546 |
desc=f"π¬ Scene {idx}/{scene_count}...")
|
547 |
+
img_uri = data_uris[(idx-1)%len(data_uris)]
|
548 |
prompt_text = build_scene_prompt(sc)
|
549 |
clip_path = runway_generate_clip(
|
550 |
model=runway_model,
|
|
|
555 |
)
|
556 |
video_clips.append(clip_path); temp_files.append(clip_path)
|
557 |
|
|
|
558 |
sharp = clip_edge_density(clip_path)
|
559 |
if sharp < SHARPNESS_MIN:
|
560 |
log.info(f"Scene {idx} low sharpness ({sharp:.4f}) - retrying with detail boost")
|
|
|
600 |
|
601 |
# ---------------- Gradio Interface ----------------
|
602 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
603 |
+
gr.Markdown("# π¬ AI Video Studio (Gen-4 / Turbo + Gemini + ElevenLabs)")
|
604 |
gr.Markdown(
|
605 |
+
"Iterate with Turbo, finalize with Gen-4. Upload up to 4 keyframes for stronger subject consistency."
|
|
|
606 |
)
|
607 |
|
608 |
with gr.Row():
|
|
|
613 |
scene_count = gr.Slider(1, MAX_SCENES, value=DEFAULT_SCENES, step=1, label="Scenes")
|
614 |
clip_duration = gr.Radio(choices=sorted(list(ALLOWED_DURATIONS)), value=5, label="Seconds/Scene")
|
615 |
ratio = gr.Dropdown(choices=sorted(list(SUPPORTED_RATIOS)), value="1280:720", label="Aspect Ratio")
|
616 |
+
quality_mode = gr.Checkbox(label="Quality Mode (gen4 vs gen4_turbo)", value=False)
|
617 |
|
618 |
+
gr.Markdown("### Narration (ElevenLabs primary; fallback silent track)")
|
619 |
with gr.Row():
|
620 |
refresh_btn = gr.Button("π Refresh Voices")
|
621 |
voices_dd = gr.Dropdown(choices=[], label="ElevenLabs Voice (Name|ID)")
|
|
|
649 |
|
650 |
gr.Markdown(
|
651 |
"### Tips\n"
|
652 |
+
"- Use detailed keyframes with clear subject & lighting.\n"
|
653 |
+
"- Add emotional descriptors directly in narration text for richer prosody.\n"
|
654 |
+
"- Iterate with Turbo then switch to Quality Mode to finalize.\n"
|
655 |
+
"- Adjust Stability/Similarity for expressiveness vs consistency."
|
656 |
)
|
657 |
|
658 |
if __name__ == '__main__':
|