import os import re import logging import torch import tempfile from typing import Tuple, Union from scipy.io.wavfile import write from pydub import AudioSegment from dotenv import load_dotenv import spaces import gradio as gr # Transformers & Models from transformers import ( AutoTokenizer, AutoModelForCausalLM, pipeline, AutoProcessor, MusicgenForConditionalGeneration, ) # Coqui TTS from TTS.api import TTS # Kokoro TTS (ensure these are installed) # pip install -q kokoro>=0.8.2 soundfile # apt-get -qq -y install espeak-ng > /dev/null 2>&1 from kokoro import KPipeline import soundfile as sf # --------------------------------------------------------------------- # Configuration & Logging Setup # --------------------------------------------------------------------- load_dotenv() HF_TOKEN = os.getenv("HF_TOKEN") if not HF_TOKEN: logging.warning("HF_TOKEN environment variable not set!") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") # Global Model Caches LLAMA_PIPELINES = {} MUSICGEN_MODELS = {} TTS_MODELS = {} # --------------------------------------------------------------------- # Utility Functions # --------------------------------------------------------------------- def clean_text(text: str) -> str: """ Clean text by removing undesired characters. Args: text (str): Input text to be cleaned. Returns: str: Cleaned text. """ # Remove all asterisks. Additional cleaning rules can be added. return re.sub(r'\*', '', text) # --------------------------------------------------------------------- # Model Loading Helper Functions # --------------------------------------------------------------------- def get_llama_pipeline(model_id: str, token: str) -> pipeline: """ Load and cache the LLaMA text-generation pipeline. Args: model_id (str): Hugging Face model identifier. token (str): Hugging Face authentication token. Returns: pipeline: Text-generation pipeline instance. """ if model_id in LLAMA_PIPELINES: return LLAMA_PIPELINES[model_id] try: tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token) model = AutoModelForCausalLM.from_pretrained( model_id, use_auth_token=token, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True, ) text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer) LLAMA_PIPELINES[model_id] = text_pipeline return text_pipeline except Exception as e: logging.error(f"Error loading LLaMA pipeline: {e}") raise def get_musicgen_model(model_key: str = "facebook/musicgen-large") -> Tuple[MusicgenForConditionalGeneration, AutoProcessor]: """ Load and cache the MusicGen model and its processor. Args: model_key (str): Model key (default uses 'facebook/musicgen-large'). Returns: tuple: (MusicGen model, processor) """ if model_key in MUSICGEN_MODELS: return MUSICGEN_MODELS[model_key] try: model = MusicgenForConditionalGeneration.from_pretrained(model_key) processor = AutoProcessor.from_pretrained(model_key) device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) MUSICGEN_MODELS[model_key] = (model, processor) return model, processor except Exception as e: logging.error(f"Error loading MusicGen model: {e}") raise def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC") -> TTS: """ Load and cache the TTS model. Args: model_name (str): Name of the TTS model. Returns: TTS: TTS model instance. """ if model_name in TTS_MODELS: return TTS_MODELS[model_name] try: tts_model = TTS(model_name) TTS_MODELS[model_name] = tts_model return tts_model except Exception as e: logging.error(f"Error loading TTS model: {e}") raise # --------------------------------------------------------------------- # Script Generation Function # --------------------------------------------------------------------- @spaces.GPU(duration=100) def generate_script(user_prompt: str, model_id: str, token: str, duration: int) -> Tuple[str, str, str]: """ Generate a script, sound design suggestions, and music ideas from a user prompt. Args: user_prompt (str): The user's creative input. model_id (str): Hugging Face model identifier for LLaMA. token (str): Hugging Face authentication token. duration (int): Desired duration of the promo in seconds. Returns: tuple: (voice_script, sound_design, music_suggestions) """ try: text_pipeline = get_llama_pipeline(model_id, token) system_prompt = ( "You are an expert radio imaging producer specializing in sound design and music. " f"Based on the user's concept and the selected duration of {duration} seconds, produce the following:\n" "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'\n" "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'\n" "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'" ) combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:" with torch.inference_mode(): result = text_pipeline( combined_prompt, max_new_tokens=300, do_sample=True, temperature=0.8 ) generated_text = result[0]["generated_text"] # Remove everything before the 'Output:' marker if present if "Output:" in generated_text: generated_text = generated_text.split("Output:")[-1].strip() # Initialize default outputs voice_script = "No voice-over script found." sound_design = "No sound design suggestions found." music_suggestions = "No music suggestions found." # Parse generated text based on expected prefixes if "Voice-Over Script:" in generated_text: voice_section = generated_text.split("Voice-Over Script:")[1] if "Sound Design Suggestions:" in voice_section: voice_script = voice_section.split("Sound Design Suggestions:")[0].strip() else: voice_script = voice_section.strip() if "Sound Design Suggestions:" in generated_text: sound_section = generated_text.split("Sound Design Suggestions:")[1] if "Music Suggestions:" in sound_section: sound_design = sound_section.split("Music Suggestions:")[0].strip() else: sound_design = sound_section.strip() if "Music Suggestions:" in generated_text: music_suggestions = generated_text.split("Music Suggestions:")[-1].strip() return voice_script, sound_design, music_suggestions except Exception as e: logging.error(f"Error in generate_script: {e}") return f"Error generating script: {e}", "", "" # --------------------------------------------------------------------- # Voice-Over Generation Functions # --------------------------------------------------------------------- @spaces.GPU(duration=100) def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC") -> Union[str, None]: """ Generate a voice-over audio file using Coqui TTS from the provided script. Args: script (str): The voice-over script. tts_model_name (str): TTS model identifier. Returns: str: File path to the generated .wav file or an error message. """ try: if not script.strip(): raise ValueError("No script provided.") cleaned_script = clean_text(script) tts_model = get_tts_model(tts_model_name) output_path = os.path.join(tempfile.gettempdir(), "voice_over_coqui.wav") tts_model.tts_to_file(text=cleaned_script, file_path=output_path) logging.info(f"Coqui voice-over generated at {output_path}") return output_path except Exception as e: logging.error(f"Error in generate_voice (Coqui TTS): {e}") return f"Error generating voice: {e}" @spaces.GPU(duration=100) def generate_voice_kokoro(script: str, lang_code: str = 'a', voice: str = 'af_heart', speed: float = 1.0) -> Union[str, None]: """ Generate a voice-over audio file using the Kokoro TTS model. Args: script (str): The text to synthesize. lang_code (str): Language code ('a' for American English, etc.). voice (str): Specific voice style. speed (float): Speech speed. Returns: str: File path to the generated WAV file or an error message. """ try: # Initialize the Kokoro pipeline kp = KPipeline(lang_code=lang_code) audio_segments = [] generator = kp(script, voice=voice, speed=speed, split_pattern=r'\n+') for i, (gs, ps, audio) in enumerate(generator): audio_segments.append(audio) # Join audio segments using pydub combined = AudioSegment.empty() for seg in audio_segments: segment = AudioSegment( seg.tobytes(), frame_rate=24000, sample_width=seg.dtype.itemsize, channels=1 ) combined += segment output_path = os.path.join(tempfile.gettempdir(), "voice_over_kokoro.wav") combined.export(output_path, format="wav") logging.info(f"Kokoro voice-over generated at {output_path}") return output_path except Exception as e: logging.error(f"Error in generate_voice_kokoro: {e}") return f"Error generating Kokoro voice: {e}" # --------------------------------------------------------------------- # Music Generation Function # --------------------------------------------------------------------- @spaces.GPU(duration=200) def generate_music(prompt: str, audio_length: int) -> Union[str, None]: """ Generate music based on the prompt using MusicGen. Args: prompt (str): Music prompt or style suggestion. audio_length (int): Length parameter (number of tokens). Returns: str: File path to the generated .wav file or an error message. """ try: if not prompt.strip(): raise ValueError("No music suggestion provided.") model_key = "facebook/musicgen-large" musicgen_model, musicgen_processor = get_musicgen_model(model_key) device = "cuda" if torch.cuda.is_available() else "cpu" inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device) with torch.inference_mode(): outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length) audio_data = outputs[0, 0].cpu().numpy() normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16") output_path = os.path.join(tempfile.gettempdir(), "musicgen_generated_music.wav") write(output_path, 44100, normalized_audio) logging.info(f"Music generated at {output_path}") return output_path except Exception as e: logging.error(f"Error in generate_music: {e}") return f"Error generating music: {e}" # --------------------------------------------------------------------- # Audio Blending Function # --------------------------------------------------------------------- @spaces.GPU(duration=100) def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10) -> Union[str, None]: """ Blend voice and music audio files with optional ducking. Args: voice_path (str): File path to the voice audio. music_path (str): File path to the music audio. ducking (bool): If True, attenuate music during voice segments. duck_level (int): Attenuation level in dB. Returns: str: File path to the blended .wav file or an error message. """ try: if not (os.path.isfile(voice_path) and os.path.isfile(music_path)): raise FileNotFoundError("Missing audio files for blending.") voice = AudioSegment.from_wav(voice_path) music = AudioSegment.from_wav(music_path) voice_duration = len(voice) if len(music) < voice_duration: looped_music = AudioSegment.empty() while len(looped_music) < voice_duration: looped_music += music music = looped_music else: music = music[:voice_duration] if ducking: ducked_music = music - duck_level final_audio = ducked_music.overlay(voice) else: final_audio = music.overlay(voice) output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav") final_audio.export(output_path, format="wav") logging.info(f"Audio blended at {output_path}") return output_path except Exception as e: logging.error(f"Error in blend_audio: {e}") return f"Error blending audio: {e}" # --------------------------------------------------------------------- # Gradio Interface with Enhanced UI # --------------------------------------------------------------------- with gr.Blocks(css=""" /* Global Styles */ body { background: linear-gradient(135deg, #1d1f21, #3a3d41); color: #f0f0f0; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; } .header { text-align: center; padding: 2rem 1rem; background: linear-gradient(90deg, #6a11cb, #2575fc); border-radius: 0 0 20px 20px; margin-bottom: 2rem; } .header h1 { margin: 0; font-size: 2.5rem; } .header p { font-size: 1.2rem; } .gradio-container { background: #2e2e2e; border-radius: 10px; padding: 1rem; } .tab-title { font-size: 1.1rem; font-weight: bold; } .footer { text-align: center; font-size: 0.9em; margin-top: 2rem; padding: 1rem; color: #cccccc; } """) as demo: # Custom Header with gr.Row(elem_classes="header"): gr.Markdown("""

🎧 AI Promo Studio

Your all-in-one AI solution for crafting engaging audio promos.

""") gr.Markdown(""" Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate: - **Script**: Generate a compelling voice-over script with LLaMA. - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS or Kokoro TTS. - **Music Production**: Produce custom music tracks with MusicGen. - **Audio Blending**: Seamlessly blend voice and music with options for ducking. """) with gr.Tabs(): # Step 1: Generate Script with gr.Tab("📝 Script Generation"): with gr.Row(): user_prompt = gr.Textbox( label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show...", lines=2 ) with gr.Row(): llama_model_id = gr.Textbox( label="LLaMA Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct", placeholder="Enter a valid Hugging Face model ID" ) duration = gr.Slider( label="Desired Promo Duration (seconds)", minimum=15, maximum=60, step=15, value=30 ) generate_script_button = gr.Button("Generate Script", variant="primary") script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False) sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False) music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False) generate_script_button.click( fn=lambda prompt, model, dur: generate_script(prompt, model, HF_TOKEN, dur), inputs=[user_prompt, llama_model_id, duration], outputs=[script_output, sound_design_output, music_suggestion_output], ) # Step 2: Generate Voice with gr.Tab("🎤 Voice Synthesis"): gr.Markdown("Generate a natural-sounding voice-over. Choose your TTS engine below:") voice_engine = gr.Dropdown( label="TTS Engine", choices=["Coqui TTS", "Kokoro TTS"], value="Coqui TTS", multiselect=False ) selected_tts_model = gr.Dropdown( label="TTS Model / Voice Option", choices=[ "tts_models/en/ljspeech/tacotron2-DDC", # Coqui TTS option "tts_models/en/ljspeech/vits", # Coqui TTS option "af_heart" # Kokoro TTS voice option ], value="tts_models/en/ljspeech/tacotron2-DDC", multiselect=False ) generate_voice_button = gr.Button("Generate Voice-Over", variant="primary") voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath") def generate_voice_combined(script, engine, model_choice): if engine == "Coqui TTS": return generate_voice(script, model_choice) elif engine == "Kokoro TTS": # For Kokoro, pass the voice option (e.g., "af_heart") and default language code ('a') return generate_voice_kokoro(script, lang_code='a', voice=model_choice, speed=1.0) else: return "Error: Unknown TTS engine." generate_voice_button.click( fn=generate_voice_combined, inputs=[script_output, voice_engine, selected_tts_model], outputs=voice_audio_output, ) # Step 3: Generate Music with gr.Tab("🎶 Music Production"): gr.Markdown("Generate a custom music track using the **MusicGen Large** model.") audio_length = gr.Slider( label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512, info="Increase tokens for longer audio (inference time may vary)." ) generate_music_button = gr.Button("Generate Music", variant="primary") music_output = gr.Audio(label="Generated Music (WAV)", type="filepath") generate_music_button.click( fn=lambda prompt, length: generate_music(prompt, length), inputs=[music_suggestion_output, audio_length], outputs=[music_output], ) # Step 4: Blend Audio with gr.Tab("🎚️ Audio Blending"): gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.") ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True) duck_level_slider = gr.Slider( label="Ducking Level (dB attenuation)", minimum=0, maximum=20, step=1, value=10 ) blend_button = gr.Button("Blend Voice + Music", variant="primary") blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath") blend_button.click( fn=blend_audio, inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider], outputs=blended_output ) # Footer gr.Markdown(""" """) # Visitor Badge gr.HTML("""
visitor badge
""") demo.launch(debug=True)