# app.py — Hugging Face Space "Higgs Audio v2" by glodov (commit 3464b5a, verified).
# Purpose: audio generation demo served via Gradio.
"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
# Hugging Face Hub repo IDs for the generation model and its audio tokenizer.
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
# Initialize the serve engine once at import time so every request reuses it.
# Prefer GPU when available; CPU fallback will be very slow for a 3B model.
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)
def generate_audio(
    text_input,
    scene_description,
    temperature,
    top_p,
    top_k,
    max_new_tokens,
    reference_audio_file,
):
    """Generate speech audio from text using Higgs Audio v2.

    Args:
        text_input: Text to synthesize. Must be non-empty.
        scene_description: Optional description of the acoustic environment;
            falls back to a quiet-room default when blank.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        max_new_tokens: Generation length limit.
        reference_audio_file: Optional path to a reference clip for voice
            cloning, or None.

    Returns:
        Path to a temporary WAV file containing the generated audio.

    Raises:
        gr.Error: On empty input text or any failure during generation.
    """
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # Build the system prompt; fall back to a quiet-room scene when the
    # user left the description blank.
    scene = scene_description.strip() if scene_description else ""
    if not scene:
        scene = "Audio is recorded from a quiet room."
    system_prompt = (
        "Generate audio following instruction.\n\n"
        f"<|scene_desc_start|>\n{scene}\n<|scene_desc_end|>"
    )

    messages = [
        Message(role="system", content=system_prompt),
        Message(role="user", content=text_input),
    ]

    # Add reference audio if provided. gr.Audio(type="filepath") passes a
    # plain str path; older Gradio versions passed a tempfile wrapper with a
    # .name attribute — getattr handles both (the original `.name` access
    # crashed on the str case).
    if reference_audio_file is not None:
        ref_path = getattr(reference_audio_file, "name", reference_audio_file)
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample.",
            )
        )
        messages.append(
            Message(
                role="assistant",
                content=AudioContent(audio_url=ref_path),
            )
        )

    try:
        # Run generation through the shared serve engine.
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )
        # Persist the waveform to a temp WAV file; delete=False so Gradio can
        # read it after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output.audio, output.sampling_rate)
            return tmp_file.name
    except Exception as e:
        # Surface the failure in the UI, preserving the original traceback.
        raise gr.Error(f"Error during generation: {str(e)}") from e
# --- Gradio UI ------------------------------------------------------------
# Two-column layout: inputs + sampling controls on the left, output on the
# right. The Blocks context is bound to `demo` for the __main__ launcher.
with gr.Blocks(title="Higgs Audio v2") as demo:
    gr.Markdown(
        """
# 🎵 Higgs Audio v2: Expressive Audio Generation
Generate expressive speech from text with Higgs Audio v2.
For best results, use a GPU-enabled space.
"""
    )

    with gr.Row():
        # Left column: text, scene, optional cloning reference, parameters.
        with gr.Column():
            prompt_box = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=5,
            )
            scene_box = gr.Textbox(
                label="Scene Description (Optional)",
                placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
                lines=2,
            )
            ref_clip = gr.Audio(
                label="Reference Audio (Optional) - Voice Cloning",
                type="filepath",
            )

            # Sampling knobs, collapsed by default.
            with gr.Accordion("Generation Parameters", open=False):
                temp_slider = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature",
                )
                topp_slider = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p (nucleus sampling)",
                )
                topk_slider = gr.Slider(
                    minimum=1, maximum=100, value=50, step=1,
                    label="Top-k",
                )
                maxtok_slider = gr.Slider(
                    minimum=128, maximum=4096, value=1024, step=128,
                    label="Max New Tokens",
                )

            run_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: playback of the generated clip.
        with gr.Column():
            result_audio = gr.Audio(label="Generated Audio")

    # Wire the button to the generation callback.
    run_btn.click(
        generate_audio,
        inputs=[
            prompt_box,
            scene_box,
            temp_slider,
            topp_slider,
            topk_slider,
            maxtok_slider,
            ref_clip,
        ],
        outputs=result_audio,
    )
# Script entry point: start the Gradio server.
# NOTE(review): share=True asks Gradio for a public *.gradio.live tunnel when
# run locally; Hugging Face Spaces provide their own hosting and ignore this
# flag — confirm the public-share behavior is intended for local runs.
if __name__ == "__main__":
    demo.launch(share=True)