# app.py
from __future__ import annotations

import gradio as gr
import os
import shutil
import datetime
from typing import List, Optional

# ──────────────────────────────────────────────────────────────────────────────
# Import project-specific helpers — unchanged from the initial version
# ──────────────────────────────────────────────────────────────────────────────
from scripts.generate_scripts import generate_script, generate_title, generate_description
from scripts.generate_voice import generate_voice
from scripts.get_footage import get_video_montage_from_folder
from scripts.edit_video import edit_video
from scripts.generate_subtitles import (
    transcribe_audio_to_subs,
    chunk_text_by_words,
    add_subtitles_to_video,
)

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

# ──────────────────────────────────────────────────────────────────────────────
# Constants & utilities
# ──────────────────────────────────────────────────────────────────────────────
WORDS_PER_SECOND = 2.3  # ≈ 140 words per minute

ASSETS_DIRS = (
    "./assets/audio",
    "./assets/backgrounds",
    "./assets/output",
    "./assets/video_music",
)

# ────────────────────────────────────────────────────────
# CONFIGURATION
# ────────────────────────────────────────────────────────
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-4B")
DTYPE = torch.float16  # or torch.bfloat16 on GPUs that support it

print(f"🔄 Loading {MODEL_ID} (dtype = {DTYPE}) …")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    trust_remote_code=True,
)
model.to("cuda" if torch.cuda.is_available() else "cpu")  # single-device move
DEVICE = next(model.parameters()).device
print(f"✅ Model ready on {DEVICE}.")
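# Optional: `BitsAndBytesConfig` is imported above but unused by the default
# loading path. A minimal 4-bit quantized load for memory-constrained GPUs
# might look like the sketch below (this assumes the `bitsandbytes` package is
# installed; swap it in for the `from_pretrained` call above if needed):
#
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=DTYPE,
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID,
#       quantization_config=bnb_config,
#       device_map="auto",  # let accelerate place the quantized weights
#       trust_remote_code=True,
#   )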
for d in ASSETS_DIRS:
    os.makedirs(d, exist_ok=True)


def safe_copy(src: str, dst: str) -> str:
    """Copy src → dst unless they are the same file; return the destination path."""
    if os.path.abspath(src) == os.path.abspath(dst):
        return src
    shutil.copy(src, dst)
    return dst


def timestamped_filename(prefix: str, ext: str) -> str:
    """Timestamp generated files so different runs don't overwrite each other."""
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return os.path.join("./assets/output", f"{prefix}_{ts}.{ext}")


# ──────────────────────────────────────────────────────────────────────────────
# Independent functional endpoints (Gradio callbacks)
# ──────────────────────────────────────────────────────────────────────────────
def cb_generate_script(
    context: str,
    instruction: str,
    target_duration: int,
    script_mode: str,
    custom_script: Optional[str],
):
    """Generate (or accept) a script plus a title and description."""
    approx_words = int(target_duration * WORDS_PER_SECOND)

    if script_mode == "Use my script":
        if not custom_script or not custom_script.strip():
            raise gr.Error("❌ You selected 'Use my script' but the script field is empty!")
        script = custom_script.strip()
    else:
        prompt = (
            f"You are a video creation expert. Here is the context: {context.strip()}\n"
            f"Instruction: {instruction.strip()}\n"
            f"🔴 Strict target duration: {target_duration}s — ≈ {approx_words} words (must be respected)."
        )
        script = generate_script(model, tokenizer, prompt)

    title = generate_title(model, tokenizer, script)
    description = generate_description(model, tokenizer, script)
    return script, title, description, script  # last value updates the shared state


def cb_generate_voice(script: str):
    """TTS generation from a given script; returns the path to the MP3."""
    if not script or not script.strip():
        raise gr.Error("❌ Script text is empty – generate or paste a script first.")
    voice_path = timestamped_filename("voice", "mp3")
    generate_voice(script, voice_path)
    return voice_path, voice_path  # second value updates the shared state


def accumulate_files(new: List[str], state: List[str] | None):
    """Append only new, valid MP4 files to the accumulated state."""
    state = state or []
    for f in new or []:
        if (
            isinstance(f, str)
            and os.path.isfile(f)
            and f.lower().endswith(".mp4")
            and f not in state
        ):
            state.append(f)
    return state


def cb_create_montage(
    accumulated_videos: List[str],
    voice_path: str,
    lum: float,
    contrast: float,
    gamma: float,
    show_bar: bool,
):
    """Create the background-video montage synced to the narration audio."""
    if not accumulated_videos:
        raise gr.Error("❌ Please upload at least one background video (.mp4) before generating the montage.")
    if not voice_path or not os.path.isfile(voice_path):
        raise gr.Error("❌ A narration audio file (.mp3) is required – generate or upload one first.")

    # Clean previous backgrounds, then copy the new ones in
    for f in os.listdir("./assets/backgrounds"):
        if f.lower().endswith(".mp4"):
            os.remove(os.path.join("./assets/backgrounds", f))
    for idx, v in enumerate(accumulated_videos):
        safe_copy(v, os.path.join("./assets/backgrounds", f"video_{idx:03d}.mp4"))

    # get_video_montage_from_folder saves the file itself and returns its path
    montage_path = get_video_montage_from_folder(
        folder_path="./assets/backgrounds",
        audio_path=voice_path,
        output_dir="./assets/video_music",
        lum=lum,
        contrast=contrast,
        gamma=gamma,
        show_progress_bar=show_bar,
    )
    return montage_path, montage_path


def cb_mix_audio(
    montage_path: str,
    voice_path: str,
    music_file: Optional[str] = None,
):
    """Combine the montage video, voice audio, and optional background music."""
    if not montage_path or not os.path.isfile(montage_path):
        raise gr.Error("❌ Please generate a montage video first.")
    if not voice_path or not os.path.isfile(voice_path):
        raise gr.Error("❌ Narration audio missing – generate or upload it.")

    music_path = music_file if music_file and os.path.isfile(music_file) else None
    final_no_subs = timestamped_filename("final_no_subs", "mp4")
    edit_video(montage_path, voice_path, music_path, final_no_subs)
    return final_no_subs, final_no_subs


def cb_add_subtitles(final_no_subs: str, voice_path: str):
    """Overlay dynamic subtitles on the mixed video."""
    if not final_no_subs or not os.path.isfile(final_no_subs):
        raise gr.Error("❌ Mixed video not found – run the 'Mix Audio/Video' step first.")
    if not voice_path or not os.path.isfile(voice_path):
        raise gr.Error("❌ Narration audio missing.")

    segments = transcribe_audio_to_subs(voice_path)
    subs = chunk_text_by_words(segments, max_words=3)
    final_with_subs = timestamped_filename("final_with_subs", "mp4")
    add_subtitles_to_video(final_no_subs, subs, final_with_subs)
    return final_with_subs
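# The callbacks above are plain functions, so the pipeline can also be driven
# headlessly (without the UI). A sketch, assuming the asset folders exist as
# created above and `clips` is a list of paths to real MP4 files:
#
#   script, title, desc, _ = cb_generate_script("context", "instruction", 60,
#                                               "Generate script with AI", None)
#   voice, _ = cb_generate_voice(script)
#   montage, _ = cb_create_montage(clips, voice, lum=6, contrast=1.0,
#                                  gamma=1.0, show_bar=False)
#   mixed, _ = cb_mix_audio(montage, voice)
#   final = cb_add_subtitles(mixed, voice)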
gr.Markdown("# 🎬 Modular AI Video Toolkit") gr.Markdown( "Each tab exposes **one single processing step** so you can mix & match them as you like. 💡" ) # Shared state across tabs script_state = gr.State("") voice_state = gr.State("") montage_state = gr.State("") final_no_subs_state = gr.State("") # ───────────────────────── Script generation ───────────────────────── with gr.Tab("1️⃣ Generate Script"): with gr.Row(): context_in = gr.Textbox(label="🧠 Context", lines=4) instruction_in = gr.Textbox(label="🎯 Instruction", lines=4) duration_slider = gr.Slider(5, 120, 1, 60, label="⏱️ Target duration (s)") script_mode = gr.Radio([ "Generate script with AI", "Use my script", ], value="Generate script with AI", label="Script mode") custom_script_in = gr.Textbox(label="✍️ My script", lines=8, interactive=False) def _toggle(mode): return gr.update(interactive=(mode == "Use my script")) script_mode.change(_toggle, inputs=script_mode, outputs=custom_script_in) gen_script_btn = gr.Button("📝 Create Script", variant="primary") script_out = gr.Textbox(label="Script", lines=8, interactive=False) title_out = gr.Textbox(label="Title", lines=1, interactive=False) desc_out = gr.Textbox(label="Description", lines=3, interactive=False) gen_script_btn.click( cb_generate_script, [context_in, instruction_in, duration_slider, script_mode, custom_script_in], [script_out, title_out, desc_out, script_state], ) # ───────────────────────── Voice generation ───────────────────────── with gr.Tab("2️⃣ Generate Voice"): script_in_voice = gr.Textbox(label="Script (paste or use from previous step)", lines=8) gen_voice_btn = gr.Button("🔈 Synthesize Voice", variant="primary") voice_audio = gr.Audio(label="Generated voice", interactive=False) gen_voice_btn.click( cb_generate_voice, inputs=[script_in_voice], outputs=[voice_audio, voice_state], ) # Auto‑populate script textbox with state when it updates script_state.change(lambda s: s, script_state, script_in_voice, queue=False) # ───────────────────────── Montage creation ───────────────────────── with gr.Tab("3️⃣ Create Montage"): videos_dropzone = gr.Files(label="🎞️ Background videos (MP4)", file_types=[".mp4"], type="filepath") videos_state = gr.State([]) videos_dropzone.upload(accumulate_files, [videos_dropzone, videos_state], videos_state, queue=False) videos_display = gr.Textbox(label="Selected videos", interactive=False) videos_state.change(lambda s: "\n".join(os.path.basename(f) for f in s), videos_state, videos_display, queue=False) with gr.Accordion("🎨 Visual settings", open=False): lum_slider = gr.Slider(0, 20, 6, step=0.5, label="Brightness (0–20)") contrast_slider = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Contrast (0.5–2.0)") gamma_slider = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Gamma (0.5–2.0)") show_bar = gr.Checkbox(label="Show progress bar", value=True) create_montage_btn = gr.Button("🎞️ Build Montage", variant="primary") montage_video = gr.Video(label="Montage Preview") create_montage_btn.click( cb_create_montage, [videos_state, voice_state, lum_slider, contrast_slider, gamma_slider, show_bar], [montage_video, montage_state], ) # ───────────────────────── Mixing (voice + music) ───────────────────────── with gr.Tab("4️⃣ Mix Audio / Video"): voice_in = gr.File(label="Narration MP3 (optional – leave empty to use state)", file_types=[".mp3"], type="filepath") montage_in = gr.File(label="Montage MP4 (optional – leave empty to use state)", file_types=[".mp4"], type="filepath") music_in = gr.File(label="Background music (MP3 – optional)", file_types=[".mp3"], 
type="filepath") def _use_state(file, state): return file if file else state mix_btn = gr.Button("🎚️ Mix", variant="primary") final_no_subs_vid = gr.Video(label="Mixed video (no subtitles)") mix_btn.click( lambda montage, voice, music, montage_state_val, voice_state_val: cb_mix_audio( _use_state(montage, montage_state_val), _use_state(voice, voice_state_val), music, ), [montage_in, voice_in, music_in, montage_state, voice_state], [final_no_subs_vid, final_no_subs_state], ) # ───────────────────────── Subtitles ───────────────────────── with gr.Tab("5️⃣ Add Subtitles"): video_in_sub = gr.File(label="Video MP4 (optional – defaults to last mixed video)", type="filepath", file_types=[".mp4"]) voice_in_sub = gr.File(label="Narration MP3 (optional – defaults to last generated voice)", type="filepath", file_types=[".mp3"]) add_subs_btn = gr.Button("🔤 Add Subtitles", variant="primary") final_subs_video = gr.Video(label="Final video with subtitles") add_subs_btn.click( lambda v_in, a_in, v_state, a_state: cb_add_subtitles( v_in if v_in else v_state, a_in if a_in else a_state, ), [video_in_sub, voice_in_sub, final_no_subs_state, voice_state], final_subs_video, ) # Startup demo.launch()