import gradio as gr
from PIL import Image
from inference.main import MultiModalPhi2

# Initialize the chatbot model
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)
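# NOTE: the model object is used below as a plain callable,
# multimodal_phi2(text, audio, image) -> str. That interface is assumed
# from how run() invokes it; adjust the call there if MultiModalPhi2 in
# inference/main.py exposes a different entry point.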

# Append the user's text/image/audio inputs to the chat history.
def add_content(chatbot, text, image, audio_upload, audio_mic):
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # Prefer a microphone recording over an uploaded audio file.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Reject an empty submission.
        raise gr.Error("Enter a valid text, image, or audio")
    return chatbot


# Run the model on the collected inputs and append its reply.
def run(history, text, image, audio_upload, audio_mic):
    if text in [None, ""]:
        text = None

    # Fold the two audio sources into a single input.
    if audio_upload is not None:
        audio = audio_upload
    elif audio_mic is not None:
        audio = audio_mic
    else:
        audio = None

    print("text", text)
    print("image", image)
    print("audio", audio)

    if image is not None:
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)

    history.append((None, outputs))
    # Return None for each input component to clear it after the reply.
    return history, None, None, None, None


# Minimal feedback handler: log the message index, content, and liked flag.
def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)


# UI setup
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [],
        elem_id="chatbot",
        bubble_full_width=False,
        avatar_images=(None, None),  # Avatar images can be supplied here
    )

    with gr.Row():
        txt = gr.Textbox(
            scale=4,
            show_label=False,
            placeholder="Enter text and press enter, or attach an image/audio",
            container=False,
        )
        img = gr.Image(type="filepath", label="Image")
        # `sources=` is the Gradio 4.x argument; Gradio 3.x uses `source=`.
        audio_upload = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
        audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Microphone")

    inputs = [chatbot, txt, img, audio_upload, audio_mic]
    txt.submit(add_content, inputs, [chatbot], queue=False).then(
        run, inputs, inputs, api_name="bot_response"
    )

    chatbot.like(print_like_dislike, None, None)

# Launch the Gradio UI
demo.queue()
demo.launch()
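
# Quick smoke test (sketch, for development only): query the model directly,
# bypassing the UI. "sample.jpg" is a hypothetical local file, and the call
# relies on the same callable interface used in run() above.
#
#   reply = multimodal_phi2("Describe this image.", None, Image.open("sample.jpg"))
#   print(reply)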