# AIPromoStudio / app.py
import streamlit as st
import torch
import scipy.io.wavfile
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AutoProcessor,
    MusicgenForConditionalGeneration
)
# ---------------------------------------------------------------------
# Page Configuration
# ---------------------------------------------------------------------
st.set_page_config(
    page_icon="🎧",
    layout="wide",
    page_title="Radio Imaging Audio Generator - Llama 3",
    initial_sidebar_state="expanded",
)
# ---------------------------------------------------------------------
# Custom CSS for a Catchy UI
# ---------------------------------------------------------------------
CUSTOM_CSS = """
<style>
body {
    background-color: #FAFCFF;
    color: #1F2937;
    font-family: 'Segoe UI', Tahoma, sans-serif;
}
h1, h2, h3, h4, h5, h6 {
    color: #3B82F6;
    margin-bottom: 0.5em;
}
.stButton>button {
    background-color: #3B82F6 !important;
    color: #FFFFFF !important;
    border-radius: 8px !important;
    font-size: 16px !important;
    margin: 0.5em 0;
}
.sidebar .sidebar-content {
    background: #E0F2FE;
}
.material-card {
    border: 1px solid #D1D5DB;
    border-radius: 8px;
    padding: 1rem;
    margin-bottom: 1rem;
    background-color: #ffffff;
}
.footer-note {
    text-align: center;
    opacity: 0.6;
    font-size: 14px;
    margin-top: 30px;
}
</style>
"""
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
# ---------------------------------------------------------------------
# Header Section
# ---------------------------------------------------------------------
st.markdown(
    """
    <h1>🎙 Radio Imaging Audio Generator <span style="font-size: 24px; color: #F59E0B;">(Beta with Llama 3)</span></h1>
    <p style='font-size:18px;'>
        Generate custom radio ads, station promos, and jingles in multiple languages
        using the <strong>Llama 3.3 Instruct</strong> model & MusicGen!
    </p>
    """,
    unsafe_allow_html=True
)
st.markdown("---")
# ---------------------------------------------------------------------
# Instructions Section
# ---------------------------------------------------------------------
with st.expander("📘 How to Use This Web App"):
    st.markdown(
        """
        1. **Enter a concept** in any language: Describe the style, mood, length, etc.
        2. **Choose Language**: Pick the output language for the script (e.g., Spanish) in the sidebar.
        3. **Refine with Llama 3**: Let the model transform your brief into a catchy script.
        4. **Set Audio Options**: Choose a style (Rock, Pop, Classical...) and max tokens for the MusicGen output.
        5. **Generate Audio**: Listen, then optionally download or upload the WAV file.

        **Future Enhancements**:
        - **User Authentication**: Restrict access or track usage with logins.
        - **Advanced Fine-tuning**: Adjust Llama or MusicGen for specialized station branding.
        - **Cloud Storage**: Upload final WAVs to a server or cloud bucket for easy sharing.
        """
    )
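# ---------------------------------------------------------------------
# (Sketch) Future enhancement: simple password gate
# ---------------------------------------------------------------------
# A minimal, illustrative sketch of the "User Authentication" idea listed in the
# expander above. It assumes a hypothetical APP_PASSWORD entry in st.secrets and
# is intentionally left commented out -- it is not part of the current app flow.
#
# def require_password() -> None:
#     entered = st.sidebar.text_input("Access password", type="password")
#     if entered != st.secrets.get("APP_PASSWORD", ""):
#         st.warning("Enter the access password to use the generator.")
#         st.stop()
#
# require_password()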
# ---------------------------------------------------------------------
# Sidebar: Model Selection & Options
# ---------------------------------------------------------------------
with st.sidebar:
    st.header("🔧 Model & Audio Config")

    # Llama 3 model ID on Hugging Face (gated; requires accepting the license)
    llama_model_id = st.text_input(
        "Llama 3 Instruct Model ID",
        value="meta-llama/Llama-3.3-70B-Instruct",
        help="Requires accepting the model license on Hugging Face."
    )
    device_option = st.selectbox(
        "Hardware Device",
        ["auto", "cpu"],
        help="If running locally with a GPU, choose 'auto'. CPU-only might be slow for large models."
    )

    st.markdown("---")

    # Multi-language prompt
    language = st.selectbox(
        "Choose Output Language",
        ["English", "Spanish", "French", "German", "Other (explain in your prompt)"]
    )

    st.markdown("---")

    # Audio style and tokens
    music_style = st.selectbox(
        "Preferred Music Style",
        ["Pop", "Rock", "Electronic", "Classical", "Hip-Hop", "Reggae", "Ambient", "Other"]
    )
    audio_tokens = st.slider(
        "MusicGen Max Tokens (Approx. Track Length)",
        min_value=128, max_value=1024, value=512, step=64,
        help="MusicGen generates roughly 50 tokens per second of audio, so 512 tokens is about 10 seconds."
    )
# ---------------------------------------------------------------------
# Prompt Input
# ---------------------------------------------------------------------
st.markdown("## ✍🏻 Write Your Concept Brief")
prompt = st.text_area(
    "Describe the radio imaging or jingle you want to create.",
    placeholder="e.g. 'An energetic 15-second pop jingle in Spanish for a morning radio show...'"
)
# ---------------------------------------------------------------------
# Text Generation with Llama 3
# ---------------------------------------------------------------------
@st.cache_resource
def load_llama_pipeline(model_id: str, device: str):
"""
Load the Llama or other open-source model as a text-generation pipeline.
This is hypothetical for Llama 3.3.
Must accept license on HF if the model is restricted.
"""
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float16 if device == "auto" else torch.float32,
device_map=device
)
gen_pipeline = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
device_map=device
)
return gen_pipeline
def generate_description(user_prompt: str, pipeline_gen, language_choice: str):
    """
    Use the pipeline to create a refined description for MusicGen,
    with multi-language capabilities.
    """
    # Instruction for Llama (system prompt):
    system_prompt = (
        "You are a creative ad copywriter specialized in radio imaging. "
        "Refine the user's concept into a concise script. "
        "Incorporate the language choice and creative elements for a promotional audio spot."
    )

    # Combine user prompt + language + the system instructions
    combined_prompt = (
        f"{system_prompt}\n"
        f"Language to use: {language_choice}\n"
        f"User Concept: {user_prompt}\n"
        f"Your refined ad script:"
    )

    result = pipeline_gen(
        combined_prompt,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8
    )
    generated_text = result[0]["generated_text"]

    # Attempt to isolate the script portion
    if "script:" in generated_text.lower():
        generated_text = generated_text.split("script:", 1)[-1].strip()

    # Add a sign-off or brand line
    generated_text += "\n\n(Generated by Radio Imaging Audio Generator - Powered by Llama 3)"
    return generated_text
# Button: Generate Description
if st.button("📄 Refine Description with Llama 3"):
    if not prompt.strip():
        st.error("Please provide a concept before generating a description.")
    else:
        with st.spinner("Generating a refined description..."):
            try:
                pipeline_llama = load_llama_pipeline(llama_model_id, device_option)
                refined_text = generate_description(prompt, pipeline_llama, language)
                st.session_state['refined_prompt'] = refined_text
                st.success("Description successfully refined!")
                st.write(refined_text)
                st.download_button(
                    "📥 Download Description",
                    refined_text,
                    file_name="refined_description.txt"
                )
            except Exception as e:
                st.error(f"Error while generating with Llama 3: {e}")
st.markdown("---")
# ---------------------------------------------------------------------
# MusicGen: Generate Audio
# ---------------------------------------------------------------------
@st.cache_resource
def load_musicgen_model():
    """Load and cache the MusicGen model and processor."""
    mg_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    mg_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    return mg_model, mg_processor
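# (Sketch) The float32 waveform from MusicGen is written below as a 32-bit float WAV,
# which st.audio handles fine. If a downstream player needs plain 16-bit PCM instead,
# a conversion along these lines would work -- illustrative only, not called anywhere:
#
# import numpy as np
# def to_int16_pcm(waveform):
#     """Clip to [-1, 1] and scale to the int16 range expected by PCM WAV files."""
#     return (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)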
if st.button("▶ Generate Audio with MusicGen"):
    if 'refined_prompt' not in st.session_state or not st.session_state['refined_prompt']:
        st.error("Please generate or have a refined script before creating audio.")
    else:
        descriptive_text = st.session_state['refined_prompt']
        with st.spinner("Generating your audio..."):
            try:
                musicgen_model, processor = load_musicgen_model()

                # Incorporate the style preference into the final text
                final_text_for_music = f"{descriptive_text}\nStyle preference: {music_style}"

                # Use the refined prompt + style as input
                inputs = processor(
                    text=[final_text_for_music],
                    padding=True,
                    return_tensors="pt"
                )

                # Adjust max_new_tokens for track length
                audio_values = musicgen_model.generate(**inputs, max_new_tokens=audio_tokens)
                sampling_rate = musicgen_model.config.audio_encoder.sampling_rate

                # Save & display the audio (move to CPU first in case the model sits on a GPU)
                audio_filename = f"radio_imaging_output_{music_style.lower()}.wav"
                scipy.io.wavfile.write(
                    audio_filename,
                    rate=sampling_rate,
                    data=audio_values[0, 0].cpu().numpy()
                )
                st.success("Audio successfully generated!")
                st.audio(audio_filename)

                # Optionally, prompt to "Upload to Cloud" or "Save to Directory"
                if st.checkbox("Upload this WAV to cloud storage? (Demo)"):
                    with st.spinner("Uploading... (This is a placeholder)"):
                        # Pseudocode for your custom logic, e.g.:
                        # upload_to_s3(audio_filename, bucket_name="radio-imaging-bucket")
                        st.success("File uploaded to your cloud storage (placeholder).")
            except Exception as e:
                st.error(f"Error while generating audio: {e}")
# ---------------------------------------------------------------------
# Footer Section
# ---------------------------------------------------------------------
st.markdown("---")
st.markdown(
    "<div class='footer-note'>"
    "✅ Built with Llama 3.3 & MusicGen · "
    "Multi-language, advanced styles, and a hint of future expansions · "
    "Happy producing!"
    "</div>",
    unsafe_allow_html=True
)
# Hide Streamlit's default menu and footer if you wish
st.markdown("<style>#MainMenu {visibility: hidden;} footer {visibility: hidden;}</style>", unsafe_allow_html=True)