# gradio / app.py
# glodov's picture
# Audio generation.
# 3464b5a verified
# raw
# history blame
# 5 kB
"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
# Model and tokenizer paths
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
# Initialize the engine once at startup
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)
def generate_audio(
    text_input,
    scene_description,
    temperature,
    top_p,
    top_k,
    max_new_tokens,
    reference_audio_file
):
    """Generate speech audio from text using Higgs Audio v2.

    Args:
        text_input: Text to synthesize.
        scene_description: Optional description of the acoustic environment,
            injected between the ``<|scene_desc_start|>``/``<|scene_desc_end|>``
            markers of the system prompt.
        temperature: Sampling temperature forwarded to the serve engine.
        top_p: Nucleus-sampling threshold forwarded to the serve engine.
        top_k: Top-k sampling cutoff forwarded to the serve engine.
        max_new_tokens: Generation length limit forwarded to the serve engine.
        reference_audio_file: Optional reference voice clip for cloning. The
            ``gr.Audio(type="filepath")`` component passes a plain ``str``
            path (older Gradio file components passed an object with ``.name``).

    Returns:
        Path to a temporary WAV file containing the generated audio.

    Raises:
        gr.Error: If the input text is empty or generation/writing fails.
    """
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # Build the system prompt; fall back to a neutral scene when none given.
    scene = scene_description if scene_description else "Audio is recorded from a quiet room."
    system_prompt = (
        "Generate audio following instruction.\n\n"
        f"<|scene_desc_start|>\n{scene}\n<|scene_desc_end|>"
    )

    messages = [
        Message(role="system", content=system_prompt),
        Message(role="user", content=text_input)
    ]

    # Voice-cloning context: pair a speaker-tagged transcript with the
    # reference clip so the model can imitate the voice.
    # NOTE(review): this context is appended *after* the generation request
    # above — confirm the engine tolerates this message ordering.
    if reference_audio_file is not None:
        # BUG FIX: with gr.Audio(type="filepath") this argument is a str,
        # which has no `.name` attribute — the original
        # `reference_audio_file.name` raised AttributeError. Accept both a
        # plain path and a file-like object carrying `.name`.
        ref_path = getattr(reference_audio_file, "name", reference_audio_file)
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample."
            )
        )
        messages.append(
            Message(
                role="assistant",
                content=AudioContent(audio_url=ref_path)
            )
        )

    try:
        # Generate audio
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )
        # Persist to a named temp file (delete=False so it outlives this call
        # for Gradio to serve). Close the handle before writing by path:
        # on Windows an open NamedTemporaryFile cannot be reopened by name.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_file.close()
        sf.write(tmp_file.name, output.audio, output.sampling_rate)
        return tmp_file.name
    except Exception as e:
        # Surface the failure in the Gradio UI, preserving the cause chain.
        raise gr.Error(f"Error during generation: {str(e)}") from e
# Gradio interface: two-column layout — inputs/controls on the left,
# generated audio on the right.
with gr.Blocks(title="Higgs Audio v2") as demo:
    gr.Markdown("""
# 🎵 Higgs Audio v2: Expressive Audio Generation
Generate expressive speech from text with Higgs Audio v2.
For best results, use a GPU-enabled space.
""")
    with gr.Row():
        with gr.Column():
            # Primary text to synthesize.
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=5
            )
            # Optional acoustic-environment hint, spliced into the system prompt.
            scene_description = gr.Textbox(
                label="Scene Description (Optional)",
                placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
                lines=2
            )
            # type="filepath" means generate_audio receives a str path.
            reference_audio = gr.Audio(
                label="Reference Audio (Optional) - Voice Cloning",
                type="filepath"
            )
            # Sampling knobs, collapsed by default.
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p (nucleus sampling)"
                )
                top_k = gr.Slider(
                    minimum=1, maximum=100, value=50, step=1,
                    label="Top-k"
                )
                max_new_tokens = gr.Slider(
                    minimum=128, maximum=4096, value=1024, step=128,
                    label="Max New Tokens"
                )
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
    # Wire the button; the inputs list must match generate_audio's
    # positional parameter order exactly.
    generate_btn.click(
        generate_audio,
        inputs=[
            text_input,
            scene_description,
            temperature,
            top_p,
            top_k,
            max_new_tokens,
            reference_audio
        ],
        outputs=audio_output
    )
# For HF Spaces, we need to set up proper sharing
# NOTE(review): `share=True` is ignored (with a warning) when running inside
# a Hugging Face Space — presumably intended for local testing; confirm.
if __name__ == "__main__":
    demo.launch(share=True)