SeaLLM-7B-v2.5-simple

Runtime error

App Files Community

lukecq committed on Mar 16

Commit

9d1731e

verified ·

1 Parent(s): 531980d

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -156

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 import time
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
 from io import BytesIO
 from urllib.request import urlopen
@@ -7,56 +8,47 @@ import librosa
 import os, json
 from sys import argv
 from vllm import LLM, SamplingParams
-# def load_model_processor(model_path):
-#     processor = AutoProcessor.from_pretrained(model_path)
-#     llm = LLM(
-#         model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
-#         enforce_eager=True,  device = "cuda",
-#         limit_mm_per_prompt={"audio": 5},
-#     )
-#     return llm, processor
 def load_model_processor(model_path):
     processor = AutoProcessor.from_pretrained(model_path)
-    model = Qwen2AudioForConditionalGeneration.from_pretrained(model_path, device_map="auto")
-    model_name = model_path.split("/")[-1]
-    return model, processor, model_name
-model_path1 = "Qwen/Qwen2-Audio-7B-Instruct" #argv[1]
 model1, processor1 = load_model_processor(model_path1)
-# def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-#                     max_new_tokens = 2048):
-#     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-#     audios = []
-#     for message in conversation:
-#         if isinstance(message["content"], list):
-#             for ele in message["content"]:
-#                 if ele["type"] == "audio":
-#                     if ele['audio_url'] != None:
-#                         audios.append(librosa.load(
-#                             ele['audio_url'],
-#                             sr=processor.feature_extractor.sampling_rate)[0]
-#                         )
-#     sampling_params = SamplingParams(
-#         temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
-#         stop_token_ids=[],
-#     )
-#     input = {
-#             'prompt': text,
-#             'multi_modal_data': {
-#                 'audio': [(audio, 16000) for audio in audios]
-#             }
-#             }
-#     output = model.generate([input], sampling_params=sampling_params)[0]
-#     response = output.outputs[0].text
-#     return response
-def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,max_new_tokens = 2048):
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
@@ -68,103 +60,49 @@ def response_to_audio_conv(conversation, model=None, processor=None, temperature
                             ele['audio_url'],
                             sr=processor.feature_extractor.sampling_rate)[0]
                         )
-    if audios != []:
-        inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True,sampling_rate=16000)
-    else:
-        inputs = processor(text=text, return_tensors="pt", padding=True)
-    inputs.input_ids = inputs.input_ids.to("cuda")
-    inputs = {k: v.to("cuda") for k, v in inputs.items() if v is not None}
-    generate_ids = model.generate(**inputs, max_new_tokens=2048, temperature = 0.3, do_sample=True)
-    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
-    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     return response
-def print_like_dislike(x: gr.LikeData):
-    print(x.index, x.value, x.liked)
-def add_message(history, message):
-    paths = []
-    for turn in history:
-        if turn['role'] == "user" and type(turn['content']) != str:
-            paths.append(turn['content'][0])
-    for x in message["files"]:
-        if x not in paths:
-            history.append({"role": "user", "content": {"path": x}})
-    if message["text"] is not None:
-        history.append({"role": "user", "content": message["text"]})
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
-def format_user_messgae(message):
-    if type(message['content']) == str:
-        return {"role": "user", "content": [{"type": "text", "text": message['content']}]}
-    else:
-        return {"role": "user", "content": [{"type": "audio", "audio_url": message['content'][0]}]}
-def history_to_conversation(history):
-    conversation = []
-    audio_paths = []
-    for turn in history:
-        if turn['role'] == "user":
-            if not turn['content']:
-                continue
-            turn = format_user_messgae(turn)
-            if turn['content'][0]['type'] == 'audio':
-                if turn['content'][0]['audio_url'] in audio_paths:
-                    continue
-                else:
-                    audio_paths.append(turn['content'][0]['audio_url'])
-            if len(conversation) > 0 and conversation[-1]["role"] == "user":
-                conversation[-1]['content'].append(turn['content'][0])
-            else:
-                conversation.append(turn)
-        else:
-            conversation.append(turn)
-    print(json.dumps(conversation, indent=4, ensure_ascii=False))
-    return conversation
-def bot(history: list, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-                    max_new_tokens = 2048):
-    conversation = history_to_conversation(history)
-    response = response_to_audio_conv(conversation, model=model1, processor=processor1, temperature = temperature,repetition_penalty=repetition_penalty, top_p = top_p, max_new_tokens = max_new_tokens)
-    # response = "Nice to meet you!"
-    print("Bot:",response)
-    history.append({"role": "assistant", "content": ""})
-    for character in response:
-        history[-1]["content"] += character
-        time.sleep(0.01)
-        yield history
-insturctions = """**Instruction**: there are three input format:
-    1. text: input text message only
-    2. audio: upload audio file or record a voice message
-    3. audio + text: record a voice message and input text message"""
 with gr.Blocks() as demo:
     # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
     # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
     gr.Markdown(
-        """<div style="text-align: center; font-size: 32px; font-weight: bold;">SeaLLMs-Audio ChatBot</div>""",
-    )
-    # Description text
-    gr.Markdown(
-        """<div style="text-align: center; font-size: 16px;">
-    This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
     You can interact with the chatbot in <b>English, Chinese, Indonesian, Thai, or Vietnamese</b>.<br>
-    For each round, you can input <b>audio and/or text</b>.
-    </div>""",
-    )
     # Links with proper formatting
     gr.Markdown(
-        """<div style="text-align: center; font-size: 16px;">
         <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Website]</a> &nbsp;
         <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Model🤗]</a> &nbsp;
         <a href="https://github.com/liuchaoqun/SeaLLMs-Audio">[Github]</a>
-        </div>""",
     )
     # gr.Markdown(insturctions)
@@ -175,36 +113,43 @@ with gr.Blocks() as demo:
     #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
     #     with gr.Column():
     #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
-    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
-    chat_input = gr.MultimodalTextbox(
-        interactive=True,
-        file_count="single",
-        file_types=['.wav'],
-        placeholder="Enter message (optional) ...",
-        show_label=False,
-        sources=["microphone", "upload"],
     )
-    chat_msg = chat_input.submit(
-        add_message, [chatbot, chat_input], [chatbot, chat_input]
     )
-    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
-    # bot_msg = chat_msg.then(bot, [chatbot, temperature, repetition_penalty, top_p], chatbot, api_name="bot_response")
-    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
-    # chatbot.like(print_like_dislike, None, None, like_user_message=True)
-    clear_button = gr.ClearButton([chatbot, chat_input])
-# PORT = 7950
-# demo.launch(server_port=PORT, show_api = True, allowed_paths = [],
-#     root_path = f"https://dsw-gateway.alibaba-inc.com/dsw81322/proxy/{PORT}/")
-demo.launch(
-    share=False,
-    inbrowser=True,
-    server_port=7950,
-    server_name="0.0.0.0",
-    max_threads=40
-)

 import gradio as gr
 import time
+import transformers
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
 from io import BytesIO
 from urllib.request import urlopen
 import os, json
 from sys import argv
 from vllm import LLM, SamplingParams
+import vllm
+from huggingface_hub import login
+TOKEN = os.environ.get("TOKEN", None)
+login(token=TOKEN)
+print("transformers version:", transformers.__version__)
+print("vllm version:", vllm.__version__)
+print("gradio version:", gr.__version__)
 def load_model_processor(model_path):
     processor = AutoProcessor.from_pretrained(model_path)
+    llm = LLM(
+        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
+        enforce_eager=True,  device = "cuda",
+        limit_mm_per_prompt={"audio": 5},
+    )
+    return llm, processor
+model_path1 = "SeaLLMs/SeaLLMs-Audio-7B"
 model1, processor1 = load_model_processor(model_path1)
+def response_to_audio(audio_url, text, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,max_new_tokens = 2048):
+    if text == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+            ]},]
+    elif audio_url == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "text", "text": text},
+           ]},]
+    else:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+                {"type": "text", "text": text},
+           ]},]
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
                             ele['audio_url'],
                             sr=processor.feature_extractor.sampling_rate)[0]
                         )
+    sampling_params = SamplingParams(
+        temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
+        stop_token_ids=[],
+    )
+    input = {
+            'prompt': text,
+            'multi_modal_data': {
+                'audio': [(audio, 16000) for audio in audios]
+            }
+            }
+    output = model.generate([input], sampling_params=sampling_params)[0]
+    response = output.outputs[0].text
     return response
+def clear_inputs():
+    return None, "", ""
+def compare_responses(audio_url, text):
+    response1 = response_to_audio(audio_url, text, model1, processor1)
+    return response1
 with gr.Blocks() as demo:
+    # gr.Markdown(f"Evaluate {model_path1}")
     # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
     # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
+    # gr.Markdown("""<center><font size=8>SeaLLMs-Audio Demo</center>""")
+    gr.Markdown("""# SeaLLMs-Audio Demo""")
     gr.Markdown(
+        """\
+<center><font size=4>This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
     You can interact with the chatbot in <b>English, Chinese, Indonesian, Thai, or Vietnamese</b>.<br>
+    For the input, you can input <b>audio and/or text</center>.""")
     # Links with proper formatting
     gr.Markdown(
+        """<center><font size=4>
         <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Website]</a> &nbsp;
         <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Model🤗]</a> &nbsp;
         <a href="https://github.com/liuchaoqun/SeaLLMs-Audio">[Github]</a>
+        </center>""",
     )
     # gr.Markdown(insturctions)
     #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
     #     with gr.Column():
     #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
+    with gr.Row():
+        with gr.Column():
+            # mic_input = gr.Microphone(label="Record Audio", type="filepath", elem_id="mic_input")
+            mic_input = gr.Audio(sources = ['upload', 'microphone'], label="Record Audio", type="filepath", elem_id="mic_input")
+        with gr.Column():
+            additional_input = gr.Textbox(label="Text Input")
+    # Button to trigger the function
+    with gr.Row():
+        btn_submit = gr.Button("Submit")
+        btn_clear = gr.Button("Clear")
+    with gr.Row():
+        output_text1 = gr.Textbox(label=model_path1.split('/')[-1], interactive=False, elem_id="output_text1")
+    btn_submit.click(
+        fn=compare_responses,
+        inputs=[mic_input, additional_input],
+        outputs=[output_text1],
     )
+    btn_clear.click(
+        fn=clear_inputs,
+        inputs=None,
+        outputs=[mic_input, additional_input, output_text1],
+        queue=False,
     )
+# demo.launch(
+#     share=False,
+#     inbrowser=True,
+#     server_port=7950,
+#     server_name="0.0.0.0",
+#     max_threads=40
+# )
+demo.launch(share=True)
+demo.queue(default_concurrency_limit=40).launch(share=True)