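# Voice-and-vision assistant demo: continuously capture webcam frames, transcribe
# microphone audio with Whisper, answer the transcribed prompt plus the latest frame
# through a LangChain chain over a multimodal chat model, and speak the response
# with OpenAI text-to-speech.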
import base64
from threading import Thread

import cv2
import openai
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from pyaudio import PyAudio, paInt16
from speech_recognition import Microphone, Recognizer, UnknownValueError

load_dotenv()

class WebcamStream:
    """Keeps the most recent webcam frame available as a base64-encoded JPEG."""

    def __init__(self):
        self.stream = cv2.VideoCapture(index=0)
        _, frame = self.stream.read()
        _, buffer = cv2.imencode(".jpeg", frame)
        self.frame = base64.b64encode(buffer).decode()
        self.running = False

    def start(self):
        # Run the capture loop in a background thread so read() always returns
        # a reasonably fresh frame.
        self.running = True
        self.thread = Thread(target=self.update, daemon=True)
        self.thread.start()
        return self

    def update(self):
        while self.running:
            _, frame = self.stream.read()
            _, buffer = cv2.imencode(".jpeg", frame)
            self.frame = base64.b64encode(buffer).decode()

    def read(self):
        return self.frame

    def stop(self):
        self.running = False
        self.stream.release()

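# Typical usage (sketch): webcam_stream = WebcamStream().start(), then call
# webcam_stream.read() whenever a frame is needed, and webcam_stream.stop()
# when shutting down.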
class Assistant:
    def __init__(self, model):
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        if not prompt:
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image},
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Stream raw PCM audio from the OpenAI TTS endpoint straight to the speakers.
        player = PyAudio().open(format=paInt16, channels=1, rate=24000, output=True)

        with openai.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="alloy",
            response_format="pcm",
            input=response,
        ) as stream:
            for chunk in stream.iter_bytes(chunk_size=1024):
                player.write(chunk)

    def _create_inference_chain(self, model):
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer their questions. Your job is to answer
        questions.

        Use few words in your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )

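# Note: Assistant expects a LangChain chat model that accepts image inputs; for
# example, ChatOpenAI(model="gpt-4o") would fit (that model name is an assumption,
# not taken from this file).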
def audio_callback(recognizer, audio):
    # Transcribe the captured audio locally with Whisper, then pass the text and
    # the latest webcam frame to the module-level assistant.
    try:
        prompt = recognizer.recognize_whisper(audio, model="base", language="english")
        assistant.answer(prompt, webcam_stream.read())
    except UnknownValueError:
        print("There was an error processing the audio.")

def main():
    st.title("AI Assistant")

    webcam_stream = WebcamStream()