"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import tempfile
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent

# Model and tokenizer paths
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"

# Initialize the engine once at startup
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)
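

# Optional preprocessing for uploaded reference audio. This is a minimal local
# sketch, not part of the Higgs Audio API: it assumes the audio tokenizer
# prefers mono 24 kHz input (the target rate is an assumption; adjust
# TARGET_SR if the model documents a different rate).
def _prepare_reference_audio(audio_path: str) -> str:
    """Downmix to mono, resample to TARGET_SR, and return a temp WAV path."""
    TARGET_SR = 24000  # assumed tokenizer sample rate
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        # Average all channels down to a single mono channel
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != TARGET_SR:
        waveform = torchaudio.functional.resample(waveform, sr, TARGET_SR)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    torchaudio.save(tmp.name, waveform, TARGET_SR)
    return tmp.name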


def generate_audio(
text_input,
scene_description,
temperature,
top_p,
top_k,
max_new_tokens,
reference_audio_file
):
"""Generate audio from text using Higgs Audio v2."""
    # Build the system prompt, falling back to a default scene description
    scene = scene_description or "Audio is recorded from a quiet room."
    system_prompt = (
        "Generate audio following instruction.\n\n"
        f"<|scene_desc_start|>\n{scene}\n<|scene_desc_end|>"
    )
    messages = [Message(role="system", content=system_prompt)]

    # Voice cloning: the reference transcript/audio pair must come before the
    # target text so the model conditions on the reference voice. The
    # transcript below is a placeholder; ideally it should match what is
    # actually said in the uploaded reference audio. Note that
    # gr.Audio(type="filepath") hands us a plain path string, and
    # _prepare_reference_audio is the local preprocessing helper defined above.
    if reference_audio_file is not None:
        ref_path = _prepare_reference_audio(reference_audio_file)
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample."
            )
        )
        messages.append(
            Message(
                role="assistant",
                content=AudioContent(audio_url=ref_path)
            )
        )

    # Target text to synthesize
    messages.append(Message(role="user", content=text_input))
try:
# Generate audio
output: HiggsAudioResponse = serve_engine.generate(
chat_ml_sample=ChatMLSample(messages=messages),
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
stop_strings=["<|end_of_text|>", "<|eot_id|>"],
)
# Save audio to temporary file
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
sf.write(tmp_file.name, output.audio, output.sampling_rate)
return tmp_file.name
    except Exception as e:
        raise gr.Error(f"Error during generation: {e}") from e
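
# Example programmatic call (hypothetical values, bypasses the UI below):
#     wav_path = generate_audio(
#         "Hello from Higgs Audio v2.",  # text_input
#         "",                            # scene_description: use default scene
#         0.7, 0.95, 50, 1024,           # temperature, top_p, top_k, max_new_tokens
#         None,                          # reference_audio_file: no voice cloning
#     )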

# Gradio interface
with gr.Blocks(title="Higgs Audio v2") as demo:
gr.Markdown("""
# 🎵 Higgs Audio v2: Expressive Audio Generation
Generate expressive speech from text with Higgs Audio v2.
For best results, use a GPU-enabled space.
""")
with gr.Row():
with gr.Column():
text_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to convert to speech...",
lines=5
)
scene_description = gr.Textbox(
label="Scene Description (Optional)",
placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
lines=2
)
reference_audio = gr.Audio(
label="Reference Audio (Optional) - Voice Cloning",
type="filepath"
)
with gr.Accordion("Generation Parameters", open=False):
temperature = gr.Slider(
minimum=0.1, maximum=2.0, value=0.7, step=0.1,
label="Temperature"
)
top_p = gr.Slider(
minimum=0.1, maximum=1.0, value=0.95, step=0.05,
label="Top-p (nucleus sampling)"
)
top_k = gr.Slider(
minimum=1, maximum=100, value=50, step=1,
label="Top-k"
)
max_new_tokens = gr.Slider(
minimum=128, maximum=4096, value=1024, step=128,
label="Max New Tokens"
)
generate_btn = gr.Button("Generate Audio", variant="primary")
with gr.Column():
audio_output = gr.Audio(label="Generated Audio")
generate_btn.click(
generate_audio,
inputs=[
text_input,
scene_description,
temperature,
top_p,
top_k,
max_new_tokens,
reference_audio
],
outputs=audio_output
)


if __name__ == "__main__":
    # On Hugging Face Spaces the app is served directly, so share=True is
    # unnecessary here; it only matters for creating a tunnel on local runs.
    demo.launch()