import base64
from threading import Lock, Thread

import cv2
import openai
import streamlit as st
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from playsound import playsound

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from streamlit_webrtc import webrtc_streamer, VideoTransformerBase

# Load OPENAI_API_KEY (and any other secrets) from a local .env file.
load_dotenv()


class Assistant:
    """History-aware multimodal assistant.

    Wraps a chat model in a LangChain pipeline that combines the running
    chat history, a text prompt, and a base64-encoded JPEG webcam frame,
    and speaks (currently: prints) the model's answer.
    """

    def __init__(self, model):
        # The chain owns a ChatMessageHistory, so this instance must be
        # kept alive for the whole conversation to retain context.
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Answer *prompt* about *image* (a base64 JPEG string).

        Does nothing for an empty prompt. The response is stripped,
        logged, and forwarded to the TTS hook when non-empty.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image},
            # A single shared session: the same history is used for
            # every call, so the session id is irrelevant.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Simple TTS simulation: Print to the console.
        print(f"TTS Response: {response}")

    def _create_inference_chain(self, model):
        """Build prompt -> model -> parser chain wrapped with message history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions. Use few words on your answers. Go straight to the point. Do
        not use any emoticons or emojis. Be friendly and helpful. Show some
        personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            # {image_base64} is filled in at invoke time.
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )


def main():
    """Streamlit entry point: webcam stream plus a Q&A form."""
    st.title("AI Assistant with Webcam Stream")

    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    # You can use OpenAI's GPT-4o model instead of Gemini Flash by
    # uncommenting the line above.
    #
    # FIX: cache the assistant in session_state. Streamlit re-executes this
    # script on every interaction; constructing a fresh Assistant per rerun
    # would silently discard the ChatMessageHistory after each question.
    if "assistant" not in st.session_state:
        model = ChatOpenAI(model="gpt-4o")
        st.session_state.assistant = Assistant(model)
    assistant = st.session_state.assistant

    class VideoProcessor(VideoTransformerBase):
        """Caches the latest webcam frame; access is guarded by a lock
        because streamlit-webrtc delivers frames on a worker thread."""

        def __init__(self):
            self.lock = Lock()
            self.frame = None  # most recent BGR ndarray, or None

        def transform(self, frame):
            with self.lock:
                self.frame = frame.to_ndarray(format="bgr24")
            return frame

        def get_base64_image(self):
            """Return the latest frame as a base64 JPEG string, or None."""
            with self.lock:
                if self.frame is not None:
                    # FIX: honor imencode's success flag instead of
                    # encoding a possibly-invalid buffer.
                    ok, buffer = cv2.imencode(".jpeg", self.frame)
                    if ok:
                        return base64.b64encode(buffer).decode("utf-8")
                return None

    ctx = webrtc_streamer(key="example", video_processor_factory=VideoProcessor)

    st.subheader("Ask the Assistant")
    prompt = st.text_input("Enter your question:")

    if ctx.video_processor:
        if st.button("Submit"):
            base64_image = ctx.video_processor.get_base64_image()
            if prompt and base64_image:
                assistant.answer(prompt, base64_image)
            else:
                st.warning("Please enter a prompt and ensure webcam feed is available.")


if __name__ == "__main__":
    main()