Spaces:
Running
Running
File size: 6,902 Bytes
17d10a7 a15d204 d448add db46bfb 1c1b50f db46bfb 1c1b50f db8ba25 db46bfb cf3593c d9bf0f0 b950350 d9bf0f0 b950350 3168a3e ecc69bf cf3593c 1c1b50f b950350 1c1b50f ecc69bf b950350 dfa5d3e db8ba25 dfa5d3e b950350 3168a3e 60b6e41 b950350 db8ba25 b950350 dfa5d3e b950350 dfa5d3e b950350 dfa5d3e 3b58485 b950350 17d10a7 db8ba25 a3b5047 8b6a33e cf3593c 17d10a7 a3b5047 b950350 d9bf0f0 b950350 d9bf0f0 6f08234 cf3593c b950350 d448add dfa5d3e b950350 dfa5d3e b950350 ecc69bf b950350 d9bf0f0 b950350 d9bf0f0 b950350 d9bf0f0 b950350 d9bf0f0 b950350 ecc69bf b950350 a3b5047 b950350 db8ba25 b950350 d9bf0f0 b950350 a3b5047 5080bd7 b950350 ecc69bf b950350 07c07fa b950350 d9bf0f0 b950350 3fe530b a8c9cb5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import gradio as gr
import os
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
pipeline,
AutoProcessor,
MusicgenForConditionalGeneration,
)
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play
import tempfile
from dotenv import load_dotenv
import spaces
# Let python-dotenv populate the process environment, then look up the
# Hugging Face token that is later handed to the script-generation step.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
# ---------------------------------------------------------------------
# Script Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
    """Generate a promo script and a music suggestion with a causal LM.

    Args:
        user_prompt: The user's promo concept.
        model_id: Hub id of the text-generation model to load.
        token: Hugging Face auth token forwarded to ``from_pretrained``.
        duration: Target promo length in seconds; only used inside the prompt.

    Returns:
        ``(script, music_suggestion)`` on success. When the model output has
        no ``"Music Suggestion:"`` section (or the section is empty),
        ``user_prompt`` is used as the music suggestion so the caller still
        has a usable music prompt. On any failure the function does not
        raise; it returns ``("Error generating script: <reason>", None)``.
    """
    music_marker = "Music Suggestion:"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            use_auth_token=token,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
        system_prompt = (
            f"You are an expert radio imaging producer specializing in sound design and music. "
            f"Based on the user's concept and the selected duration of {duration} seconds, craft a concise, engaging promo script. "
            f"Ensure the script fits within the time limit and suggest a matching music style that complements the theme."
            # The parser below looks for this exact label, so ask for it.
            f" End with a line starting with '{music_marker}' followed by the music style."
        )
        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nRefined script and music suggestion:"
        result = llama_pipeline(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
        # The pipeline echoes the prompt; keep only the text after its last line.
        generated_text = result[0]["generated_text"].split("Refined script and music suggestion:")[-1].strip()
        # partition() never raises: a missing marker yields an empty tail and
        # a repeated marker splits on the first occurrence only.
        script, _, music_suggestion = generated_text.partition(music_marker)
        music_suggestion = music_suggestion.strip()
        if not music_suggestion:
            music_suggestion = user_prompt
        return script.strip(), music_suggestion
    except Exception as e:
        return f"Error generating script: {e}", None
# ---------------------------------------------------------------------
# Voice-Over Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_voice(script: str, speaker: str):
    """Synthesize a voice-over for ``script`` and save it as a WAV file.

    Args:
        script: Text to be spoken.
        speaker: Voice style requested by the caller; not referenced by the
            body of this function.

    Returns:
        Path of ``generated_voice.wav`` inside the system temp directory, or
        the string ``"Error generating voice-over: <reason>"`` if any step
        raised.
    """
    # Replace with your chosen TTS model.
    # NOTE(review): this checkpoint is loaded through AutoModelForCausalLM and
    # the generate() output is written out as samples — confirm the checkpoint
    # actually supports that path.
    voice_checkpoint = "coqui/XTTS-v2"
    try:
        text_processor = AutoProcessor.from_pretrained(voice_checkpoint)
        voice_model = AutoModelForCausalLM.from_pretrained(voice_checkpoint)
        encoded = text_processor(script, return_tensors="pt")
        generated = voice_model.generate(**encoded)
        wav_file = f"{tempfile.gettempdir()}/generated_voice.wav"
        # Written with a fixed 22050 Hz sample rate.
        write(wav_file, 22050, generated.cpu().numpy())
    except Exception as exc:
        return f"Error generating voice-over: {exc}"
    return wav_file
# ---------------------------------------------------------------------
# Music Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_music(prompt: str, audio_length: int):
    """Generate background music with MusicGen and save it as a 16-bit WAV.

    Args:
        prompt: Text description of the music to generate.
        audio_length: Number of new tokens to generate (passed to
            ``generate`` as ``max_new_tokens``).

    Returns:
        Path of ``generated_music.wav`` inside the system temp directory, or
        the string ``"Error generating music: <reason>"`` if any step raised.
    """
    try:
        musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        musicgen_model.to(device)
        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
        # UI sliders may deliver a float; generate() needs an integer count.
        outputs = musicgen_model.generate(**inputs, max_new_tokens=int(audio_length))
        audio_data = outputs[0, 0].cpu().numpy()
        # Vectorized peak lookup (the builtin max() would iterate sample by
        # sample in Python).
        peak = float(abs(audio_data).max())
        if peak > 0:
            normalized_audio = (audio_data / peak * 32767).astype("int16")
        else:
            # Silent output: skip the division so no NaNs end up in the file.
            normalized_audio = audio_data.astype("int16")
        # Use the model's own output rate; a mismatched rate in the WAV header
        # makes the clip play at the wrong speed and pitch.
        sampling_rate = int(musicgen_model.config.audio_encoder.sampling_rate)
        output_path = f"{tempfile.gettempdir()}/generated_music.wav"
        write(output_path, sampling_rate, normalized_audio)
        return output_path
    except Exception as e:
        return f"Error generating music: {e}"
# ---------------------------------------------------------------------
# Audio Blending Function with Ducking
# ---------------------------------------------------------------------
def blend_audio(voice_path: str, music_path: str, ducking: bool):
    """Mix a voice-over on top of a music bed and export the result as WAV.

    Args:
        voice_path: Path to the voice-over audio file.
        music_path: Path to the music audio file.
        ducking: When true, the music is attenuated by 10 dB before mixing.

    Returns:
        Path of ``final_promo.wav`` inside the system temp directory, or the
        string ``"Error blending audio: <reason>"`` if any step raised.
    """
    try:
        voice = AudioSegment.from_file(voice_path)
        music = AudioSegment.from_file(music_path)
        if ducking:
            music = music - 10  # Lower music volume for ducking
        # overlay() keeps the length of the base segment, so a voice-over that
        # outlasts the music would be cut off. Pad the bed with silence first.
        missing_ms = len(voice) - len(music)
        if missing_ms > 0:
            music = music + AudioSegment.silent(duration=missing_ms, frame_rate=music.frame_rate)
        combined = music.overlay(voice)
        output_path = f"{tempfile.gettempdir()}/final_promo.wav"
        combined.export(output_path, format="wav")
        return output_path
    except Exception as e:
        return f"Error blending audio: {e}"
# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
def process_all(user_prompt, llama_model_id, duration, audio_length, speaker, ducking):
    """Run the full pipeline: script, voice-over, music, then the final mix.

    Args:
        user_prompt: The user's promo concept.
        llama_model_id: Hub id of the text-generation model.
        duration: Target promo length in seconds.
        audio_length: Token budget for music generation.
        speaker: Requested voice style, forwarded to ``generate_voice``.
        ducking: Whether to lower the music level under the voice.

    Returns:
        ``(text, audio_path)``. On success ``text`` holds the script and the
        music suggestion and ``audio_path`` is the mixed WAV file. If any
        stage fails, ``text`` is that stage's error message and
        ``audio_path`` is ``None``.
    """
    script, music_suggestion = generate_script(user_prompt, llama_model_id, hf_token, duration)
    # generate_script signals failure with a None music suggestion. Checking
    # that avoids rejecting a legitimate script that merely contains the word
    # "Error".
    if music_suggestion is None:
        return script, None

    # The remaining stages return either a file path or a message with a
    # stage-specific prefix; match the prefix rather than any "Error" substring.
    voice_path = generate_voice(script, speaker)
    if voice_path.startswith("Error generating voice-over:"):
        return voice_path, None

    music_path = generate_music(music_suggestion, audio_length)
    if music_path.startswith("Error generating music:"):
        return music_path, None

    final_audio = blend_audio(voice_path, music_path, ducking)
    # A blend failure must not be handed to the audio output as a file path.
    if final_audio.startswith("Error blending audio:"):
        return final_audio, None

    return f"Script:\n{script}\n\nMusic Suggestion:\n{music_suggestion}", final_audio
# Build the Gradio UI: a header, the input widgets, one trigger button, a text
# output, an audio output, and a footer.
# NOTE(review): indentation is missing in this copy of the file, so which
# widgets are nested inside the gr.Row() below cannot be determined from
# here — restore the nesting from the original source before running.
with gr.Blocks() as demo:
gr.Markdown("""
# 🎧 AI Promo Studio with Script, Voice, Music, and Mixing 🚀
Generate fully mixed promos effortlessly with AI-driven tools for radio and media!
""")
with gr.Row():
# Input widgets; their values are passed to process_all by the click handler.
user_prompt = gr.Textbox(label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show.")
llama_model_id = gr.Textbox(label="Llama Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct")
duration = gr.Slider(label="Duration (seconds)", minimum=15, maximum=60, step=15, value=30)
audio_length = gr.Slider(label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
speaker = gr.Textbox(label="Voice Style (optional)", placeholder="E.g., male, female, or neutral.")
ducking = gr.Checkbox(label="Enable Ducking", value=True)
generate_button = gr.Button("Generate Full Promo")
# Outputs: the first receives process_all's text result, the second its audio
# file path.
script_output = gr.Textbox(label="Generated Script and Music Suggestion")
audio_output = gr.Audio(label="Final Promo Audio", type="filepath")
# Wire the button to process_all; inputs are supplied in the order listed and
# the two return values fill the two outputs in order.
generate_button.click(
fn=process_all,
inputs=[user_prompt, llama_model_id, duration, audio_length, speaker, ducking],
outputs=[script_output, audio_output],
)
gr.Markdown("""
<hr>
<p style="text-align: center; font-size: 0.9em;">
Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
</p>
""")
# Launch the app with debug=True as soon as the module is executed.
demo.launch(debug=True)
|