"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""

import tempfile

import gradio as gr
import soundfile as sf
import torch

from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
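
# Model and audio-tokenizer checkpoints on the Hugging Face Hub.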
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
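
# Load the model and audio tokenizer once at startup so every request reuses them;
# generation falls back to CPU when no GPU is available (much slower).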
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)


def generate_audio(
    text_input,
    scene_description,
    temperature,
    top_p,
    top_k,
    max_new_tokens,
    reference_audio_file,
):
    """Generate audio from text using Higgs Audio v2."""
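
    # Wrap the optional scene description in <|scene_desc_start|>/<|scene_desc_end|>
    # tags inside the system prompt; default to a quiet-room scene otherwise.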
    if scene_description:
        system_prompt = (
            "Generate audio following instruction.\n\n"
            f"<|scene_desc_start|>\n{scene_description}\n<|scene_desc_end|>"
        )
    else:
        system_prompt = (
            "Generate audio following instruction.\n\n"
            "<|scene_desc_start|>\nAudio is recorded from a quiet room.\n<|scene_desc_end|>"
        )

    messages = [Message(role="system", content=system_prompt)]

    # Voice cloning: when a reference clip is supplied, add it as an in-context
    # example (reference transcript followed by its audio) before the text to
    # be generated, so the model continues in the reference speaker's voice.
    if reference_audio_file is not None:
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample."
            )
        )
        messages.append(
            Message(
                role="assistant",
                # gr.Audio(type="filepath") passes a plain path string.
                content=AudioContent(audio_url=reference_audio_file)
            )
        )

    # The text to synthesize is always the final user turn.
    messages.append(Message(role="user", content=text_input))

    try:
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )
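
        # Write the generated waveform to a temporary WAV file so Gradio can
        # serve it from the returned filesystem path.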
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output.audio, output.sampling_rate)
            return tmp_file.name

    except Exception as e:
        raise gr.Error(f"Error during generation: {str(e)}")
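

# Gradio UI: input controls in the left column, generated audio in the right.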
with gr.Blocks(title="Higgs Audio v2") as demo:
    gr.Markdown("""
    # 🎵 Higgs Audio v2: Expressive Audio Generation

    Generate expressive speech from text with Higgs Audio v2.
    For best results, use a GPU-enabled space.
    """)

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=5
            )

            scene_description = gr.Textbox(
                label="Scene Description (Optional)",
                placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
                lines=2
            )

            reference_audio = gr.Audio(
                label="Reference Audio (Optional) - Voice Cloning",
                type="filepath"
            )

            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature"
                )

                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p (nucleus sampling)"
                )

                top_k = gr.Slider(
                    minimum=1, maximum=100, value=50, step=1,
                    label="Top-k"
                )

                max_new_tokens = gr.Slider(
                    minimum=128, maximum=4096, value=1024, step=128,
                    label="Max New Tokens"
                )

            generate_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    generate_btn.click(
        generate_audio,
        inputs=[
            text_input,
            scene_description,
            temperature,
            top_p,
            top_k,
            max_new_tokens,
            reference_audio,
        ],
        outputs=audio_output,
    )


if __name__ == "__main__":
    # share=True is not needed on Hugging Face Spaces (it is ignored there), so launch with defaults.
    demo.launch()