Spaces:

awacke1
/

StreamlitSpeechAssistTest

Sleeping

App Files Files Community

awacke1 committed on Dec 3, 2024

Commit

b8750fa

verified ·

1 Parent(s): 1cc6fa9

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -17

app.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import base64
 import cv2
 import openai
 from dotenv import load_dotenv
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.schema.messages import SystemMessage
@@ -8,38 +13,162 @@ from langchain_community.chat_message_histories import ChatMessageHistory
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_openai import ChatOpenAI
-import streamlit as st
 load_dotenv()
-class VideoStream:
     def __init__(self):
-        self.cap = cv2.VideoCapture(0)
-    def get_frame(self):
-        ret, frame = self.cap.read()
-        encoded_image = cv2.imencode('.jpg', frame)[1].tobytes()
-        return encoded_image
 class Assistant:
-    # ... (Same code as before for the Assistant class)
 def main():
-    st.title("AI Assistant App")
-    video_stream = VideoStream()
-    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
-    model = ChatOpenAI(model="gpt-4o")  # Using OpenAI's GPT-4 model
     assistant = Assistant(model)
-    if st.button("Start"):
         while True:
-            frame = video_stream.get_frame()
-            st.image(frame, channels="BGR")
-            # Add code to capture audio input and process the response here
-            # ...
 if __name__ == "__main__":
     main()

 import base64
+from threading import Lock, Thread
 import cv2
 import openai
+import sounddevice as sd
+import streamlit as st
+from cv2 import VideoCapture, imencode
 from dotenv import load_dotenv
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.schema.messages import SystemMessage
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_openai import ChatOpenAI
+from scipy.io.wavfile import write
+from speech_recognition import Recognizer, UnknownValueError, AudioData
 load_dotenv()
class WebcamStream:
    """Keep the most recent frame from the default webcam available.

    A background thread continuously reads from ``VideoCapture(index=0)`` and
    stores the latest successfully captured frame; ``read`` hands out a copy
    of it. The object can be used as a context manager, in which case the
    reader thread is stopped and the capture device released on exit.
    """

    def __init__(self):
        """Open capture device 0 and grab an initial frame."""
        self.stream = VideoCapture(index=0)
        _, self.frame = self.stream.read()
        self.running = False
        # Created by start(); kept as None so stop() is safe before start().
        self.thread = None
        self.lock = Lock()

    def start(self):
        """Start the background reader (no-op if already running).

        Returns:
            This instance, so the call can be chained after construction.
        """
        if self.running:
            return self
        self.running = True
        # Daemon thread: a forgotten stop() must not keep the process alive.
        self.thread = Thread(target=self.update, args=(), daemon=True)
        self.thread.start()
        return self

    def update(self):
        """Reader loop: replace the stored frame until ``running`` is cleared."""
        while self.running:
            ok, frame = self.stream.read()
            # A failed grab yields no usable frame; keep the last good one so
            # read() never has to copy None.
            if not ok or frame is None:
                continue
            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        Args:
            encode: When true, return the frame JPEG-encoded and then
                base64-encoded (bytes) instead of the raw frame.

        Raises:
            RuntimeError: If no frame has been captured yet.
        """
        with self.lock:
            if self.frame is None:
                raise RuntimeError("No frame available from the webcam")
            frame = self.frame.copy()
        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)
        return frame

    def stop(self):
        """Signal the reader loop to finish and wait for its thread."""
        self.running = False
        thread = self.thread
        if thread is not None and thread.is_alive():
            thread.join()

    def __enter__(self):
        """Return the stream itself so ``with WebcamStream() as cam`` works."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Stop the reader thread, then release the capture device."""
        # Stop first so the reader is not calling read() on a released device.
        self.stop()
        self.stream.release()
 class Assistant:
+    def __init__(self, model):
+        self.chain = self._create_inference_chain(model)
+    def answer(self, prompt, image):
+        if not prompt:
+            return
+        print("Prompt:", prompt)
+        response = self.chain.invoke(
+            {"prompt": prompt, "image_base64": image.decode()},
+            config={"configurable": {"session_id": "unused"}},
+        ).strip()
+        print("Response:", response)
+        if response:
+            self._tts(response)
+    def _tts(self, response):
+        # Simulate TTS: normally you'd use a library or API here
+        print(f"TTS: {response}")
+    def _create_inference_chain(self, model):
+        SYSTEM_PROMPT = """
+        You are a witty assistant that will use the chat history and the image
+        provided by the user to answer its questions. Your job is to answer
+        questions.
+        Use few words on your answers. Go straight to the point. Do not use any
+        emoticons or emojis.
+        Be friendly and helpful. Show some personality.
+        """
+        prompt_template = ChatPromptTemplate.from_messages(
+            [
+                SystemMessage(content=SYSTEM_PROMPT),
+                MessagesPlaceholder(variable_name="chat_history"),
+                (
+                    "human",
+                    [
+                        {"type": "text", "text": "{prompt}"},
+                        {
+                            "type": "image_url",
+                            "image_url": "data:image/jpeg;base64,{image_base64}",
+                        },
+                    ],
+                ),
+            ]
+        )
+        chain = prompt_template | model | StrOutputParser()
+        chat_message_history = ChatMessageHistory()
+        return RunnableWithMessageHistory(
+            chain,
+            lambda _: chat_message_history,
+            input_messages_key="prompt",
+            history_messages_key="chat_history",
+        )
 def main():
+    st.title("AI Assistant with Webcam Stream")
+    # Instantiate Webcam Stream and start it
+    webcam_stream = WebcamStream().start()
+    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
+    # You can use OpenAI's GPT-4o model instead of Gemini Flash by uncommenting the following line:
+    model = ChatOpenAI(model="gpt-4o")
     assistant = Assistant(model)
+    # UI for webcam feed
+    st.subheader("Webcam Feed")
+    def run_webcam():
         while True:
+            frame = webcam_stream.read()
+            _, buffer = cv2.imencode('.jpg', frame)
+            frame_data = base64.b64encode(buffer).decode('utf-8')
+            # Display frame in Streamlit app
+            st.image(f"data:image/jpeg;base64,{frame_data}", use_column_width=True)
+            st.experimental_rerun()
+    webcam_thread = Thread(target=run_webcam)
+    webcam_thread.start()
+    st.subheader("Ask the Assistant")
+    prompt = st.text_input("Enter your question:")
+    if st.button("Submit"):
+        if prompt:
+            assistant.answer(prompt, webcam_stream.read(encode=True))
+        else:
+            st.warning("Please enter a prompt to submit.")
+    if st.button("Stop Webcam"):
+        webcam_stream.stop()
+        cv2.destroyAllWindows()
 if __name__ == "__main__":
     main()