# Page-scrape residue from the hosting page ("Spaces: / Sleeping"); commented
# out so the module parses:
# Spaces:
# Sleeping
# Sleeping
# Standard library
import base64
from threading import Lock, Thread

# Third-party
import cv2
import openai
import streamlit as st
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from playsound import playsound
from streamlit_webrtc import VideoTransformerBase, webrtc_streamer

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()
class Assistant:
    """Multimodal chat assistant: answers a text prompt grounded in a webcam frame.

    Wraps a LangChain chat model in a prompt | model | parser chain with an
    in-memory message history, so follow-up questions can reference earlier turns.
    """

    def __init__(self, model):
        # model: a LangChain chat model (e.g. ChatOpenAI) that accepts
        # multimodal (text + image_url) human messages.
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Run one inference turn and voice the response.

        prompt: the user's question; empty/None prompts are ignored (no-op).
        image: base64-encoded JPEG of the current webcam frame.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image},
            # A single shared history is used; the session id is unused.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Simple TTS simulation: print the response to the console.
        print(f"TTS Response: {response}")

    def _create_inference_chain(self, model):
        """Build the prompt | model | parser chain, wrapped with message history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        # NOTE(review): a bare data-URL string is accepted by
                        # LangChain, which normalizes it to the OpenAI
                        # {"image_url": {"url": ...}} shape — confirm against the
                        # pinned langchain version.
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # Single in-memory history shared by every session id.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
def main():
    """Streamlit entry point: webcam stream plus a prompt box wired to Assistant."""
    st.title("AI Assistant with Webcam Stream")

    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    # You can use OpenAI's GPT-4o model instead of Gemini Flash by uncommenting the following line:
    model = ChatOpenAI(model="gpt-4o")
    assistant = Assistant(model)

    class VideoProcessor(VideoTransformerBase):
        """Keeps the most recent webcam frame, guarded by a lock."""

        def __init__(self):
            self.lock = Lock()  # guards self.frame across webrtc/UI threads
            self.frame = None   # latest frame as a BGR ndarray, or None

        def transform(self, frame):
            # Called by streamlit-webrtc for every incoming video frame.
            with self.lock:
                self.frame = frame.to_ndarray(format="bgr24")
            # NOTE(review): the legacy VideoTransformerBase.transform() API
            # documents an ndarray return; returning the av.VideoFrame as-is
            # may rely on a compatibility shim — confirm against the pinned
            # streamlit-webrtc version.
            return frame

        def get_base64_image(self):
            """Return the latest frame as a base64 JPEG string, or None if absent."""
            with self.lock:
                if self.frame is not None:
                    _, buffer = cv2.imencode('.jpeg', self.frame)
                    return base64.b64encode(buffer).decode('utf-8')
            return None

    ctx = webrtc_streamer(key="example", video_processor_factory=VideoProcessor)

    st.subheader("Ask the Assistant")
    prompt = st.text_input("Enter your question:")

    if ctx.video_processor:
        if st.button("Submit"):
            base64_image = ctx.video_processor.get_base64_image()
            if prompt and base64_image:
                assistant.answer(prompt, base64_image)
            else:
                st.warning("Please enter a prompt and ensure webcam feed is available.")


if __name__ == "__main__":
    main()