Update app.py
app.py CHANGED
@@ -1,10 +1,6 @@
 import base64
-from threading import Lock, Thread
-
 import cv2
 import openai
-import streamlit as st
-from cv2 import VideoCapture, imencode
 from dotenv import load_dotenv
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.schema.messages import SystemMessage
@@ -12,165 +8,38 @@ from langchain_community.chat_message_histories import ChatMessageHistory
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_openai import ChatOpenAI
-
-from speech_recognition import Microphone, Recognizer, UnknownValueError
+import streamlit as st
 
 load_dotenv()
 
-
-class WebcamStream:
+class VideoStream:
     def __init__(self):
-        self.stream = VideoCapture(index=0)
-        _, self.frame = self.stream.read()
-        self.running = False
-        self.lock = Lock()
-
-    def start(self):
-        if self.running:
-            return self
-
-        self.running = True
-
-        self.thread = Thread(target=self.update, args=())
-        self.thread.start()
-        return self
-
-    def update(self):
-        while self.running:
-            _, frame = self.stream.read()
-
-            self.lock.acquire()
-            self.frame = frame
-            self.lock.release()
-
-    def read(self, encode=False):
-        self.lock.acquire()
-        frame = self.frame.copy()
-        self.lock.release()
-
-        if encode:
-            _, buffer = imencode(".jpeg", frame)
-            return base64.b64encode(buffer)
-
-        return frame
-
-    def stop(self):
-        self.running = False
-        if self.thread.is_alive():
-            self.thread.join()
-
-    def __exit__(self, exc_type, exc_value, exc_traceback):
-        self.stream.release()
+        self.cap = cv2.VideoCapture(0)
 
+    def get_frame(self):
+        ret, frame = self.cap.read()
+        encoded_image = cv2.imencode('.jpg', frame)[1].tobytes()
+        return encoded_image
 
 class Assistant:
-    def __init__(self, model):
-        self.chain = self._create_inference_chain(model)
-
-    def answer(self, prompt, image):
-        if not prompt:
-            return
-
-        response = self.chain.invoke(
-            {"prompt": prompt, "image_base64": image.decode()},
-            config={"configurable": {"session_id": "unused"}},
-        ).strip()
-
-        if response:
-            self._tts(response)
-
-    def _tts(self, response):
-        player = PyAudio().open(format=paInt16, channels=1, rate=24000, output=True)
-
-        with openai.audio.speech.with_streaming_response.create(
-            model="tts-1",
-            voice="alloy",
-            response_format="pcm",
-            input=response,
-        ) as stream:
-            for chunk in stream.iter_bytes(chunk_size=1024):
-                player.write(chunk)
-
-    def _create_inference_chain(self, model):
-        SYSTEM_PROMPT = """
-        You are a witty assistant that will use the chat history and the image
-        provided by the user to answer its questions. Your job is to answer
-        questions.
-
-        Use few words on your answers. Go straight to the point. Do not use any
-        emoticons or emojis.
-
-        Be friendly and helpful. Show some personality.
-        """
-
-        prompt_template = ChatPromptTemplate.from_messages(
-            [
-                SystemMessage(content=SYSTEM_PROMPT),
-                MessagesPlaceholder(variable_name="chat_history"),
-                (
-                    "human",
-                    [
-                        {"type": "text", "text": "{prompt}"},
-                        {
-                            "type": "image_url",
-                            "image_url": "data:image/jpeg;base64,{image_base64}",
-                        },
-                    ],
-                ),
-            ]
-        )
-
-        chain = prompt_template | model | StrOutputParser()
-
-        chat_message_history = ChatMessageHistory()
-        return RunnableWithMessageHistory(
-            chain,
-            lambda _: chat_message_history,
-            input_messages_key="prompt",
-            history_messages_key="chat_history",
-        )
-
+    # ... (Same code as before for the Assistant class)
 
 def main():
-    st.title("AI Assistant")
-
-    # Instantiate Webcam Stream and start it
-    webcam_stream = WebcamStream().start()
+    st.title("AI Assistant App")
+    video_stream = VideoStream()
 
     # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
-
-    model = ChatOpenAI(model="gpt-4o")
-    assistant = Assistant(model)
+    model = ChatOpenAI(model="gpt-4o")  # Using OpenAI's GPT-4 model
 
-
-    st_subtitle("Webcam Feed")
+    assistant = Assistant(model)
 
-
+    if st.button("Start"):
         while True:
-            frame =
-
-            frame_data = base64.b64encode(buffer).decode('utf-8')
-
-            # Display frame in Streamlit app
-            st.image(f"data:image/jpeg;base64,{frame_data}", use_column_width=True)
-            st.experimental_rerun()
-
-    webcam_thread = Thread(target=run_webcam)
-    webcam_thread.start()
-
-    st.subheader("Ask the Assistant")
-
-    prompt = st.text_input("Enter your question:")
-
-    if st.button("Submit"):
-        if prompt:
-            assistant.answer(prompt, webcam_stream.read(encode=True))
-        else:
-            st.warning("Please enter a prompt to submit.")
 
+            frame = video_stream.get_frame()
+            st.image(frame, channels="BGR")
 
-
-    cv2.destroyAllWindows()
+            # Add code to capture audio input and process the response here
+            # ...
 
 if __name__ == "__main__":
     main()
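A few notes on the new code. First, VideoStream.get_frame discards ret, so when a camera read fails cv2.imencode receives None and raises. A minimal defensive variant (a sketch, not part of this commit):

import cv2

class VideoStream:
    def __init__(self):
        # Device index 0 is the default webcam.
        self.cap = cv2.VideoCapture(0)

    def get_frame(self):
        ret, frame = self.cap.read()
        if not ret:
            return None  # let the caller decide how to handle a dropped frame
        # cv2.imencode returns (success, buffer); tobytes() yields raw JPEG bytes
        return cv2.imencode('.jpg', frame)[1].tobytes()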
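Second, the placeholder comment is now the entire body of class Assistant, and a class body consisting only of a comment does not parse: as committed, app.py raises an IndentationError. The implementation the comment refers to is in the deleted lines above; until it is restored, the smallest valid stand-in is:

class Assistant:
    # ... (Same code as before for the Assistant class)
    pass  # a bare comment is not a statement, so the body needs at least `pass`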
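Third, the Start-button loop appends a new image element on every st.image call and never returns control to Streamlit, so frames pile up until the script is stopped. A common pattern is to draw into a single st.empty() placeholder instead; a minimal sketch, assuming VideoStream can be imported from app.py (hypothetical import path), and dropping channels="BGR" because that argument only applies to numpy arrays, not the JPEG bytes get_frame returns:

import streamlit as st

from app import VideoStream  # hypothetical: assumes app.py exposes the class

video_stream = VideoStream()
placeholder = st.empty()  # a single slot; each new frame overwrites the last

if st.button("Start"):
    while True:
        frame = video_stream.get_frame()
        if frame is not None:
            placeholder.image(frame)  # st.image accepts encoded image bytes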
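Finally, for the audio-input placeholder: the import this commit removed (speech_recognition's Microphone and Recognizer) suggests one way to fill it in. A sketch under that assumption, using the library's Whisper backend (listen_once is a hypothetical helper; recognize_whisper needs the optional whisper dependency installed):

from speech_recognition import Microphone, Recognizer, UnknownValueError

recognizer = Recognizer()

def listen_once():
    # Block until one utterance is captured from the default microphone.
    with Microphone() as source:
        audio = recognizer.listen(source)
    try:
        return recognizer.recognize_whisper(audio, model="base", language="english")
    except UnknownValueError:
        return None  # speech was unintelligible

When wiring this up, note that Assistant.answer calls image.decode() on its image argument, i.e. it expects base64-encoded bytes, while get_frame returns raw JPEG bytes, so a base64.b64encode step belongs in between:

    assistant.answer(listen_once(), base64.b64encode(video_stream.get_frame()))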