import os
import tempfile

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AutoProcessor,
    MusicgenForConditionalGeneration,
)
from scipy.io.wavfile import write
from pydub import AudioSegment
from dotenv import load_dotenv
import spaces

# Coqui TTS
from TTS.api import TTS

# ---------------------------------------------------------------------
# Load Environment Variables
# ---------------------------------------------------------------------
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# ---------------------------------------------------------------------
# Global Model Caches
# ---------------------------------------------------------------------
LLAMA_PIPELINES = {}
MUSICGEN_MODELS = {}
TTS_MODELS = {}

# ---------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------
def get_llama_pipeline(model_id: str, token: str):
    """
    Returns a cached LLaMA pipeline if available; otherwise, loads it.
    """
    if model_id in LLAMA_PIPELINES:
        return LLAMA_PIPELINES[model_id]

    # 'token' replaces the deprecated 'use_auth_token' argument.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
    LLAMA_PIPELINES[model_id] = text_pipeline
    return text_pipeline


def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
    """
    Returns a cached MusicGen model if available; otherwise, loads it.
    Uses the 'large' variant for higher-quality outputs.
    """
    if model_key in MUSICGEN_MODELS:
        return MUSICGEN_MODELS[model_key]

    model = MusicgenForConditionalGeneration.from_pretrained(model_key)
    processor = AutoProcessor.from_pretrained(model_key)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    MUSICGEN_MODELS[model_key] = (model, processor)
    return model, processor


def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Returns a cached TTS model if available; otherwise, loads it.
    """
    if model_name in TTS_MODELS:
        return TTS_MODELS[model_name]

    tts_model = TTS(model_name)
    TTS_MODELS[model_name] = tts_model
    return tts_model


# ---------------------------------------------------------------------
# Script Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
    """
    Generates a script, sound design suggestions, and music ideas from a user prompt.
    Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
    """
    try:
        text_pipeline = get_llama_pipeline(model_id, token)

        system_prompt = (
            "You are an expert radio imaging producer specializing in sound design and music. "
            f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
            "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
            "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
            "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
        )
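
        # NOTE: A plain text-generation pipeline returns the prompt followed by
        # the completion, which is why the response is recovered below by
        # splitting on "Output:". For chat-tuned checkpoints such as
        # Meta-Llama-3-8B-Instruct, tokenizer.apply_chat_template() may yield
        # cleaner sections, but the simple concatenated prompt is kept here.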
        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"

        with torch.inference_mode():
            result = text_pipeline(
                combined_prompt,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.8,
            )

        generated_text = result[0]["generated_text"]
        if "Output:" in generated_text:
            generated_text = generated_text.split("Output:")[-1].strip()

        # Default placeholders
        voice_script = "No voice-over script found."
        sound_design = "No sound design suggestions found."
        music_suggestions = "No music suggestions found."

        # Voice-Over Script
        if "Voice-Over Script:" in generated_text:
            parts = generated_text.split("Voice-Over Script:")
            voice_script_part = parts[1]
            if "Sound Design Suggestions:" in voice_script_part:
                voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
            else:
                voice_script = voice_script_part.strip()

        # Sound Design
        if "Sound Design Suggestions:" in generated_text:
            parts = generated_text.split("Sound Design Suggestions:")
            sound_design_part = parts[1]
            if "Music Suggestions:" in sound_design_part:
                sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
            else:
                sound_design = sound_design_part.strip()

        # Music Suggestions
        if "Music Suggestions:" in generated_text:
            parts = generated_text.split("Music Suggestions:")
            music_suggestions = parts[1].strip()

        return voice_script, sound_design, music_suggestions

    except Exception as e:
        return f"Error generating script: {e}", "", ""


# ---------------------------------------------------------------------
# Voice-Over Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Generates a voice-over from the provided script using the Coqui TTS model.
    Returns the file path to the generated .wav file.
    """
    try:
        if not script.strip():
            return "Error: No script provided."

        tts_model = get_tts_model(tts_model_name)

        # Generate and save the voice-over
        output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
        tts_model.tts_to_file(text=script, file_path=output_path)
        return output_path

    except Exception as e:
        return f"Error generating voice: {e}"


# ---------------------------------------------------------------------
# Music Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_music(prompt: str, audio_length: int):
    """
    Generates music with the 'facebook/musicgen-large' model based on the prompt.
    Returns the file path to the generated .wav file.
    """
    try:
        if not prompt.strip():
            return "Error: No music suggestion provided."
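
        # NOTE: audio_length is a token count, not seconds. MusicGen produces
        # roughly 50 audio tokens per second, so duration scales approximately
        # as seconds ≈ audio_length / 50 (e.g. 512 tokens ≈ 10 s of audio).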
        model_key = "facebook/musicgen-large"
        musicgen_model, musicgen_processor = get_musicgen_model(model_key)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)

        with torch.inference_mode():
            outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)

        audio_data = outputs[0, 0].cpu().numpy()

        # Peak-normalize to the 16-bit PCM range (guard against silent output).
        peak = float(np.max(np.abs(audio_data))) or 1.0
        normalized_audio = (audio_data / peak * 32767).astype("int16")

        # Write at the model's native sampling rate (32 kHz for MusicGen);
        # a hard-coded 44100 would shift pitch and tempo on playback.
        sampling_rate = musicgen_model.config.audio_encoder.sampling_rate

        output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
        write(output_path, sampling_rate, normalized_audio)

        return output_path

    except Exception as e:
        return f"Error generating music: {e}"


# ---------------------------------------------------------------------
# Audio Blending with Duration Sync & Ducking
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
    """
    Blends two audio files (voice and music).
    1. If music < voice, loops the music until it meets/exceeds the voice duration.
    2. If music > voice, trims the music to the voice duration.
    3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice plays.
    Returns the file path to the blended .wav file.
    """
    try:
        if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
            return "Error: Missing audio files for blending."

        voice = AudioSegment.from_wav(voice_path)
        music = AudioSegment.from_wav(music_path)

        voice_len = len(voice)  # in milliseconds
        music_len = len(music)  # in milliseconds

        # 1) If the music is shorter than the voice, loop it:
        if music_len < voice_len:
            looped_music = AudioSegment.empty()
            # Keep appending until we meet/exceed the voice length
            while len(looped_music) < voice_len:
                looped_music += music
            music = looped_music

        # 2) If the music is longer than the voice, truncate it:
        if len(music) > voice_len:
            music = music[:voice_len]

        # Now music and voice are the same length
        if ducking:
            # Step 1: Attenuate the music by 'duck_level' dB
            ducked_music = music - duck_level
            # Step 2: Overlay the voice on top of the ducked music
            final_audio = ducked_music.overlay(voice)
        else:
            # No ducking, just overlay
            final_audio = music.overlay(voice)

        output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
        final_audio.export(output_path, format="wav")
        return output_path

    except Exception as e:
        return f"Error blending audio: {e}"


# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🎧 AI Promo Studio
    Welcome to **AI Promo Studio**, your all-in-one solution for creating professional, engaging audio promos with minimal effort!

    This platform uses powerful AI models to handle:
    - **Script Generation**: Craft concise, impactful copy with LLaMA.
    - **Voice Synthesis**: Convert text into natural-sounding voice-overs with Coqui TTS.
    - **Music Production**: Generate a custom music bed with MusicGen Large.
    - **Seamless Blending**: Combine voice and music, looping or trimming the track to match your promo length, with optional ducking to keep the voice front and center.

    Whether you're a radio producer, podcaster, or content creator, **AI Promo Studio** streamlines your production pipeline, cutting hours of manual editing down to a few clicks.
    """)

    with gr.Tabs():
        # Step 1: Generate Script
        with gr.Tab("Step 1: Generate Script"):
            with gr.Row():
                user_prompt = gr.Textbox(
                    label="Promo Idea",
                    placeholder="E.g., A 30-second promo for a morning show...",
                    lines=2,
                )
                llama_model_id = gr.Textbox(
                    label="LLaMA Model ID",
                    value="meta-llama/Meta-Llama-3-8B-Instruct",
                    placeholder="Enter a valid Hugging Face model ID",
                )
                duration = gr.Slider(
                    label="Desired Promo Duration (seconds)",
                    minimum=15,
                    maximum=60,
                    step=15,
                    value=30,
                )

            generate_script_button = gr.Button("Generate Script")
            script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
            sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
            music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)

            generate_script_button.click(
                fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
                inputs=[user_prompt, llama_model_id, duration],
                outputs=[script_output, sound_design_output, music_suggestion_output],
            )

        # Step 2: Generate Voice
        with gr.Tab("Step 2: Generate Voice"):
            gr.Markdown("Generate the voice-over using a Coqui TTS model.")
            selected_tts_model = gr.Dropdown(
                label="TTS Model",
                choices=[
                    "tts_models/en/ljspeech/tacotron2-DDC",
                    "tts_models/en/ljspeech/vits",
                    "tts_models/en/sam/tacotron-DDC",
                ],
                value="tts_models/en/ljspeech/tacotron2-DDC",
                multiselect=False,
            )
            generate_voice_button = gr.Button("Generate Voice-Over")
            voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")

            generate_voice_button.click(
                fn=lambda script, tts_model: generate_voice(script, tts_model),
                inputs=[script_output, selected_tts_model],
                outputs=voice_audio_output,
            )

        # Step 3: Generate Music (MusicGen Large)
        with gr.Tab("Step 3: Generate Music"):
            gr.Markdown("Generate a music track with the **MusicGen Large** model.")
            audio_length = gr.Slider(
                label="Music Length (tokens)",
                minimum=128,
                maximum=1024,
                step=64,
                value=512,
                info="More tokens yield longer audio, but be mindful of inference time.",
            )
            generate_music_button = gr.Button("Generate Music")
            music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")

            generate_music_button.click(
                fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
                inputs=[music_suggestion_output, audio_length],
                outputs=[music_output],
            )

        # Step 4: Blend Audio (Loop/Trim + Ducking)
        with gr.Tab("Step 4: Blend Audio"):
            gr.Markdown("**Music** will be looped or trimmed to match the **Voice** duration, then optionally ducked.")
            ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
            duck_level_slider = gr.Slider(
                label="Ducking Level (dB attenuation)",
                minimum=0,
                maximum=20,
                step=1,
                value=10,
            )
            blend_button = gr.Button("Blend Voice + Music")
            blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")

            blend_button.click(
                fn=blend_audio,
                inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
                outputs=blended_output,
            )

    # Footer
    gr.Markdown("""
    Created with ❤️ by bilsimaging.com
    """)

    # Visitor Badge
    gr.HTML("""