"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent

# Model and tokenizer paths
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"

# Initialize the engine once at startup (model load is expensive; do it
# exactly once at import time, not per request).
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)


def generate_audio(
    text_input,
    scene_description,
    temperature,
    top_p,
    top_k,
    max_new_tokens,
    reference_audio_file,
):
    """Generate speech audio from text with Higgs Audio v2.

    Args:
        text_input: Text to synthesize. Must be non-empty.
        scene_description: Optional description of the recording environment,
            injected between the ``<|scene_desc_start|>`` / ``<|scene_desc_end|>``
            markers of the system prompt.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff (cast to int; Gradio sliders emit floats).
        max_new_tokens: Generation length cap (cast to int).
        reference_audio_file: Optional filepath (str) of a reference voice
            sample for cloning. The ``gr.Audio`` component uses
            ``type="filepath"`` and therefore passes a plain string path.

    Returns:
        Path to a temporary WAV file containing the generated audio.

    Raises:
        gr.Error: On empty input text or any failure during generation.
    """
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to generate audio from.")

    # Build the system prompt; fall back to a neutral scene when none is given.
    scene = scene_description.strip() if scene_description else ""
    if not scene:
        scene = "Audio is recorded from a quiet room."
    system_prompt = (
        "Generate audio following instruction.\n\n"
        f"<|scene_desc_start|>\n{scene}\n<|scene_desc_end|>"
    )

    messages = [Message(role="system", content=system_prompt)]

    # Voice cloning: the reference pair (user transcript + assistant audio)
    # must PRECEDE the target text so the final turn is the user's request
    # and the model conditions on the reference voice.
    # NOTE(review): the transcript below is a placeholder; ideally it should
    # match the actual words spoken in the reference clip.
    if reference_audio_file:
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample.",
            )
        )
        messages.append(
            Message(
                role="assistant",
                # type="filepath" yields a str path — use it directly;
                # the original `.name` attribute access would raise
                # AttributeError on a string.
                content=AudioContent(audio_url=reference_audio_file),
            )
        )

    messages.append(Message(role="user", content=text_input))

    try:
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )
        # Persist to a temp WAV so Gradio can stream it back to the client.
        # delete=False is required: Gradio reads the file after this function
        # returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output.audio, output.sampling_rate)
        return tmp_file.name
    except gr.Error:
        # Don't re-wrap errors we raised deliberately.
        raise
    except Exception as e:
        raise gr.Error(f"Error during generation: {str(e)}") from e


# Gradio interface
with gr.Blocks(title="Higgs Audio v2") as demo:
    gr.Markdown("""
    # 🎵 Higgs Audio v2: Expressive Audio Generation
    
    Generate expressive speech from text with Higgs Audio v2.
    For best results, use a GPU-enabled space.
    """)

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=5,
            )
            scene_description = gr.Textbox(
                label="Scene Description (Optional)",
                placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
                lines=2,
            )
            reference_audio = gr.Audio(
                label="Reference Audio (Optional) - Voice Cloning",
                type="filepath",
            )
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p (nucleus sampling)",
                )
                top_k = gr.Slider(
                    minimum=1, maximum=100, value=50, step=1,
                    label="Top-k",
                )
                max_new_tokens = gr.Slider(
                    minimum=128, maximum=4096, value=1024, step=128,
                    label="Max New Tokens",
                )
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    generate_btn.click(
        generate_audio,
        inputs=[
            text_input,
            scene_description,
            temperature,
            top_p,
            top_k,
            max_new_tokens,
            reference_audio,
        ],
        outputs=audio_output,
    )

# HF Spaces serves the app directly; `share=True` is unnecessary there (it
# is ignored on Spaces) and would open a public tunnel when run locally.
if __name__ == "__main__":
    demo.launch()