File size: 3,840 Bytes
5a813b2
 
b8750fa
0729bfc
 
b8750fa
5a813b2
0729bfc
5a813b2
 
 
 
 
 
0729bfc
5a813b2
 
 
 
b8750fa
 
 
 
 
 
 
 
 
 
0729bfc
b8750fa
 
 
 
 
 
 
 
 
0729bfc
 
b8750fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0729bfc
 
 
 
b8750fa
 
 
 
 
 
 
 
 
 
 
 
 
5a813b2
b8750fa
8c6351b
b8750fa
 
 
1cc6fa9
8c6351b
0729bfc
 
 
 
b8750fa
0729bfc
 
 
b8750fa
0729bfc
b8750fa
0729bfc
 
 
 
 
 
b8750fa
0729bfc
b8750fa
 
 
 
0729bfc
 
 
 
 
 
 
5a813b2
8c6351b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import base64
import cv2
import streamlit as st
import openai
from threading import Lock, Thread
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from playsound import playsound
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from streamlit_webrtc import webrtc_streamer, VideoTransformerBase

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

class Assistant:
    """Multimodal chat assistant that answers text prompts about an image.

    Wraps a LangChain pipeline (prompt template -> chat model -> string
    parser) with message history, so each call sees the prior conversation
    plus a base64-encoded JPEG frame supplied by the caller.
    """

    def __init__(self, model):
        """Build the inference chain around *model* (a LangChain chat model)."""
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Run *prompt* + *image* (base64 JPEG string) through the chain.

        A falsy prompt is ignored. A non-empty model reply is forwarded to
        the TTS hook. Every call reuses the same session id, so history
        accumulates across invocations.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        raw = self.chain.invoke(
            {"prompt": prompt, "image_base64": image},
            config={"configurable": {"session_id": "unused"}},
        )
        response = raw.strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Stand-in for real text-to-speech: just echo to the console.
        print(f"TTS Response: {response}")

    def _create_inference_chain(self, model):
        """Assemble the prompt/model/parser chain wrapped with chat history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image 
        provided by the user to answer its questions. Your job is to answer 
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis. 

        Be friendly and helpful. Show some personality.
        """

        # Human turn carries both the text prompt and the inline image.
        user_turn = (
            "human",
            [
                {"type": "text", "text": "{prompt}"},
                {"type": "image_url", "image_url": "data:image/jpeg;base64,{image_base64}"}
            ],
        )

        template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                user_turn,
            ]
        )

        pipeline = template | model | StrOutputParser()

        # Single shared history object: every session id maps to it.
        history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            pipeline,
            lambda _session_id: history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )

def main():
    """Streamlit app: stream the webcam and answer questions about the feed."""
    st.title("AI Assistant with Webcam Stream")

    # Streamlit reruns this script top-to-bottom on every interaction.
    # Cache the assistant in session_state so its chat history survives
    # reruns; rebuilding it each run would discard the conversation and
    # defeat RunnableWithMessageHistory.
    if "assistant" not in st.session_state:
        # Swap in e.g. ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
        # here to use Gemini instead of OpenAI's GPT-4o.
        st.session_state.assistant = Assistant(ChatOpenAI(model="gpt-4o"))
    assistant = st.session_state.assistant

    class VideoProcessor(VideoTransformerBase):
        """Holds the most recent webcam frame, guarded by a lock."""

        def __init__(self):
            self.lock = Lock()
            self.frame = None  # latest BGR ndarray, or None before the first frame

        def transform(self, frame):
            # Called from the webrtc worker thread for each incoming frame;
            # the lock guards against concurrent reads from the UI thread.
            with self.lock:
                self.frame = frame.to_ndarray(format="bgr24")

            return frame

        def get_base64_image(self):
            """Return the latest frame as a base64 JPEG string, or None."""
            with self.lock:
                if self.frame is None:
                    return None
                ok, buffer = cv2.imencode('.jpeg', self.frame)
                if not ok:
                    return None  # JPEG encoding failed; treat as "no frame"
                return base64.b64encode(buffer).decode('utf-8')

    ctx = webrtc_streamer(key="example", video_processor_factory=VideoProcessor)
    st.subheader("Ask the Assistant")

    prompt = st.text_input("Enter your question:")

    if ctx.video_processor:
        if st.button("Submit"):
            base64_image = ctx.video_processor.get_base64_image()
            if prompt and base64_image:
                assistant.answer(prompt, base64_image)
            else:
                st.warning("Please enter a prompt and ensure webcam feed is available.")

# Entry point: run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()