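"""Gradio demo app for lmms-lab/Aero-1-Audio-1.5B: chat with a compact audio model
that supports ASR, basic audio understanding, and audio instruction following."""
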
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor
import librosa

def split_audio(audio_arrays, chunk_limit=480000):
    """Split a waveform into chunks of at most `chunk_limit` samples (30 s at 16 kHz)."""
    audio_splits = []
    for i in range(0, len(audio_arrays), chunk_limit):
        audio_splits.append(audio_arrays[i : i + chunk_limit])
    return audio_splits


def process_audio(audio, text, chat_history):
    """Run one chat turn: feed the optional audio plus the text query to the model
    and append both the user turn and the assistant reply to the chat history."""
    conversation = [
        {
            "role": "user",
            "content": [],
        },
    ]

    splitted_audio = None  # stays None for text-only turns
    if audio is not None:
        # Load the recording as a 16 kHz mono waveform, the rate the model expects.
        audio = librosa.load(audio, sr=16000)[0]
        splitted_audio = split_audio(audio)
        # Add one audio entry per 30-second chunk; the waveforms themselves are
        # passed to the processor further below.
        for _ in splitted_audio:
            conversation[0]["content"].append(
                {
                    "type": "audio_url",
                    "audio": "placeholder",
                }
            )
        chat_history.append({"role": "user", "content": gr.Audio(value=(16000, audio))})
    
    conversation[0]["content"].append(
        {
            "type": "text",
            "text": text,
        }
    )

    chat_history.append({"role": "user", "content": text})
    # Render the conversation with the model's chat template, then tokenize the
    # prompt together with the audio chunks and move everything to the GPU.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)

    # Keep only the newly generated tokens, dropping the echoed prompt.
    cont = outputs[:, inputs["input_ids"].shape[-1] :]

    result = processor.batch_decode(cont, skip_special_tokens=True)[0]
    chat_history.append(
        {
            "role": "assistant",
            "content": result,
        }
    )

    return chat_history
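
# Gradio UI: chat display, audio/text inputs, submit and clear buttons, and example prompts.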

with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Aero-1-Audio")
    gr.Markdown(
    """
    Aero-1-Audio is a compact audio model. With only 1.5B parameters and 50k hours of training data, it can perform a variety of tasks, including
    ASR, basic audio understanding, audio instruction following, and scene analysis.

    We provide several examples, such as:
    - An NVIDIA conference talk and an Elon Musk show for long-form ASR
    - Simple audio instruction following
    - Audio understanding for weather and music

    Note that the model may not always follow your instructions and can make mistakes.

    """
    )

    chatbot = gr.Chatbot(type="messages")

    with gr.Row(variant="compact", equal_height=True):
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        text_input = gr.Textbox(label="Text Input", placeholder="Type here", interactive=True)

    
    with gr.Row():
        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
        chatbot_submit = gr.Button("Submit", variant="primary")
        chatbot_submit.click(
            process_audio,
            inputs=[audio_input, text_input, chatbot],
            outputs=[chatbot],
        )
    
    gr.Examples(
        [
            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
            ["Please transcribe the audio for me", "./examples/nuggets.mp3"],
            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
        ],
        inputs=[text_input, audio_input],
        label="Examples",
    )


if __name__ == "__main__":
    # Load the processor and model once at startup; process_audio uses them as globals.
    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", attn_implementation="sdpa", trust_remote_code=True)
    demo.launch()