# Page-scrape residue from the hosting page ("Spaces: / Sleeping"); commented
# out so the module parses:
# Spaces:
# Sleeping
# Sleeping
# Standard library
import base64
from threading import Lock, Thread

# Third-party
import cv2
import openai
import streamlit as st
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from playsound import playsound
from streamlit_webrtc import VideoTransformerBase, webrtc_streamer

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()
class Assistant:
    """Multimodal chat assistant: answers a text prompt grounded in a webcam frame.

    Wraps a LangChain chat model in a prompt | model | parser chain with an
    in-memory message history, so follow-up questions can reference earlier turns.
    """

    def __init__(self, model):
        # model: a LangChain chat model (e.g. ChatOpenAI) that accepts
        # multimodal (text + image_url) human messages.
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Run one inference turn and voice the response.

        prompt: the user's question; empty/None prompts are ignored (no-op).
        image: base64-encoded JPEG of the current webcam frame.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image},
            # A single shared history is used; the session id is unused.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Simple TTS simulation: print the response to the console.
        print(f"TTS Response: {response}")

    def _create_inference_chain(self, model):
        """Build the prompt | model | parser chain, wrapped with message history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        # NOTE(review): a bare data-URL string is accepted by
                        # LangChain, which normalizes it to the OpenAI
                        # {"image_url": {"url": ...}} shape — confirm against the
                        # pinned langchain version.
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # Single in-memory history shared by every session id.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
def main():
    """Streamlit entry point: webcam stream plus a prompt box wired to Assistant."""
    st.title("AI Assistant with Webcam Stream")

    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    # You can use OpenAI's GPT-4o model instead of Gemini Flash by uncommenting the following line:
    model = ChatOpenAI(model="gpt-4o")
    assistant = Assistant(model)

    class VideoProcessor(VideoTransformerBase):
        """Keeps the most recent webcam frame, guarded by a lock."""

        def __init__(self):
            self.lock = Lock()  # guards self.frame across webrtc/UI threads
            self.frame = None   # latest frame as a BGR ndarray, or None

        def transform(self, frame):
            # Called by streamlit-webrtc for every incoming video frame.
            with self.lock:
                self.frame = frame.to_ndarray(format="bgr24")
            # NOTE(review): the legacy VideoTransformerBase.transform() API
            # documents an ndarray return; returning the av.VideoFrame as-is
            # may rely on a compatibility shim — confirm against the pinned
            # streamlit-webrtc version.
            return frame

        def get_base64_image(self):
            """Return the latest frame as a base64 JPEG string, or None if absent."""
            with self.lock:
                if self.frame is not None:
                    _, buffer = cv2.imencode('.jpeg', self.frame)
                    return base64.b64encode(buffer).decode('utf-8')
            return None

    ctx = webrtc_streamer(key="example", video_processor_factory=VideoProcessor)

    st.subheader("Ask the Assistant")
    prompt = st.text_input("Enter your question:")

    if ctx.video_processor:
        if st.button("Submit"):
            base64_image = ctx.video_processor.get_base64_image()
            if prompt and base64_image:
                assistant.answer(prompt, base64_image)
            else:
                st.warning("Please enter a prompt and ensure webcam feed is available.")


if __name__ == "__main__":
    main()