"""Gradio app for Higgs Audio v2 on Hugging Face Spaces."""

import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile

from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent

# Model and tokenizer paths
# Hugging Face Hub repo IDs for the generation model and its audio tokenizer.
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"

# Initialize the engine once at startup
# NOTE: this downloads/loads the model at import time, so app startup is slow
# but each request reuses the same engine instance.
device = "cuda" if torch.cuda.is_available() else "cpu"
serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)

def generate_audio(
    text_input, 
    scene_description, 
    temperature, 
    top_p, 
    top_k, 
    max_new_tokens,
    reference_audio_file
):
    """Generate speech audio from text using Higgs Audio v2.

    Args:
        text_input: Text to synthesize.
        scene_description: Optional description of the recording environment,
            injected between the scene-description sentinel tokens.
        temperature, top_p, top_k: Sampling parameters forwarded to the engine.
        max_new_tokens: Generation length cap forwarded to the engine.
        reference_audio_file: Optional reference audio for voice cloning.
            ``gr.Audio(type="filepath")`` passes a plain path string.

    Returns:
        Path to a temporary WAV file containing the generated audio.

    Raises:
        gr.Error: If the serve engine fails during generation.
    """
    # Prepare system message; fall back to a quiet-room scene when none given.
    scene = scene_description if scene_description else "Audio is recorded from a quiet room."
    system_prompt = (
        "Generate audio following instruction.\n\n"
        f"<|scene_desc_start|>\n{scene}\n<|scene_desc_end|>"
    )

    messages = [Message(role="system", content=system_prompt)]

    # Voice cloning: the reference (user transcript + assistant audio) pair
    # must come BEFORE the generation request so the conversation ends on the
    # user turn we want the model to answer in the reference voice.
    if reference_audio_file is not None:
        # gr.Audio(type="filepath") yields a str path; older Gradio versions
        # passed a tempfile object with a `.name` attribute — handle both.
        ref_path = getattr(reference_audio_file, "name", reference_audio_file)
        messages.append(
            Message(
                role="user",
                content="[SPEAKER0] This is a reference voice sample."
            )
        )
        messages.append(
            Message(
                role="assistant",
                content=AudioContent(audio_url=ref_path)
            )
        )

    # The actual synthesis request goes last.
    messages.append(Message(role="user", content=text_input))

    try:
        # Generate audio
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )

        # Save audio to a temporary file that Gradio can serve back.
        # delete=False is required: Gradio reads the file after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output.audio, output.sampling_rate)
            return tmp_file.name

    except Exception as e:
        # Surface the failure in the UI; chain the cause for server-side logs.
        raise gr.Error(f"Error during generation: {str(e)}") from e

# Gradio interface
# NOTE: component creation order inside each column determines the rendered
# layout, so statements here are order-sensitive.
with gr.Blocks(title="Higgs Audio v2") as demo:
    gr.Markdown("""
    # 🎵 Higgs Audio v2: Expressive Audio Generation
    
    Generate expressive speech from text with Higgs Audio v2. 
    For best results, use a GPU-enabled space.
    """)
    
    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=5
            )
            
            scene_description = gr.Textbox(
                label="Scene Description (Optional)",
                placeholder="Describe the audio environment (e.g., 'Audio recorded in a noisy cafe')",
                lines=2
            )
            
            # type="filepath" makes Gradio pass the upload as a path string.
            reference_audio = gr.Audio(
                label="Reference Audio (Optional) - Voice Cloning",
                type="filepath"
            )
            
            # Sampling controls, collapsed by default.
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature"
                )
                
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p (nucleus sampling)"
                )
                
                top_k = gr.Slider(
                    minimum=1, maximum=100, value=50, step=1,
                    label="Top-k"
                )
                
                max_new_tokens = gr.Slider(
                    minimum=128, maximum=4096, value=1024, step=128,
                    label="Max New Tokens"
                )
            
            generate_btn = gr.Button("Generate Audio", variant="primary")
            
        # Right column: playback of the generated result.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
    
    # Wire the button to the handler; input order must match the
    # generate_audio parameter order.
    generate_btn.click(
        generate_audio,
        inputs=[
            text_input, 
            scene_description, 
            temperature, 
            top_p, 
            top_k, 
            max_new_tokens,
            reference_audio
        ],
        outputs=audio_output
    )

# Entry point. On Hugging Face Spaces the platform serves the app itself and
# ignores share=True (Gradio only logs a warning for it); omitting it also
# prevents local runs from unexpectedly opening a public tunnel.
if __name__ == "__main__":
    demo.launch()