Spaces:

awacke1
/

Scholarly-Article-VoiceChat

Build error

App Files Files Community

awacke1 committed on Jun 27, 2024

Commit

0c09b53

verified ·

1 Parent(s): 8772f59

Create app.py

Browse files

Files changed (1) hide show

app.py +163 -0

app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import streamlit as st
+import base64
+from threading import Lock, Thread
+import cv2
+import openai
+from dotenv import load_dotenv
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.schema.messages import SystemMessage
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai import ChatOpenAI
+from langchain_google_genai import ChatGoogleGenerativeAI
+import pyaudio
+import speech_recognition as sr
+import os
+import json
+import random
+import requests
+import time
+import zipfile
+from PIL import Image
+from urllib.parse import quote
+# Load variables from a local .env file (if one exists) into the process
+# environment before any client below is constructed.
+load_dotenv()
+class WebcamStream:
+    """Grab frames from the default webcam on a background thread.
+
+    The most recent successfully captured frame is kept in ``self.frame`` and
+    handed out as a copy by :meth:`read`. The instance can also be used as a
+    context manager, which stops the reader thread and releases the capture
+    device on exit.
+    """
+    def __init__(self):
+        # Device index 0 is the first capture device OpenCV finds.
+        self.stream = cv2.VideoCapture(0)
+        # Prime the buffer; ``self.frame`` stays None if nothing was captured.
+        _, self.frame = self.stream.read()
+        self.running = False
+        # Created by start(); kept as None so stop() is safe before start().
+        self.thread = None
+        self.lock = Lock()
+    def start(self):
+        """Start the background reader thread (no-op if already running).
+
+        Returns:
+            The stream itself, so the call can be chained.
+        """
+        if self.running:
+            return self
+        self.running = True
+        self.thread = Thread(target=self.update, args=())
+        self.thread.start()
+        return self
+    def update(self):
+        """Reader-thread loop: refresh the buffered frame until stopped."""
+        while self.running:
+            ok, frame = self.stream.read()
+            if not ok or frame is None:
+                # Keep the last good frame rather than overwriting it with
+                # None, and back off briefly so a dead camera does not spin.
+                time.sleep(0.01)
+                continue
+            with self.lock:
+                self.frame = frame
+    def read(self, encode=False):
+        """Return a copy of the latest frame.
+
+        Args:
+            encode: When true, JPEG-encode the frame and return it as
+                base64 bytes instead of the raw frame.
+
+        Returns:
+            A copy of the buffered frame, or base64-encoded JPEG bytes when
+            ``encode`` is true.
+
+        Raises:
+            RuntimeError: If no frame has been captured yet.
+        """
+        # ``with`` guarantees the lock is released even when this raises;
+        # otherwise the reader thread would block forever on the lock.
+        with self.lock:
+            if self.frame is None:
+                raise RuntimeError("No frame available from the webcam")
+            frame = self.frame.copy()
+        if encode:
+            _, buffer = cv2.imencode(".jpeg", frame)
+            return base64.b64encode(buffer)
+        return frame
+    def stop(self):
+        """Signal the reader thread to finish and wait for it to exit."""
+        self.running = False
+        if self.thread is not None and self.thread.is_alive():
+            self.thread.join()
+    def __enter__(self):
+        """Enter the context manager; returns the stream itself."""
+        return self
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Stop the reader thread, then release the capture device."""
+        # Stop first so the thread is not reading from a released device.
+        self.stop()
+        self.stream.release()
+class Assistant:
+    """Answer prompts about a webcam image and speak the reply aloud.
+
+    Wraps a chat model in a prompt chain that carries the running chat
+    history, shows the prompt and response through Streamlit, and plays the
+    response with OpenAI text-to-speech.
+    """
+    def __init__(self, model):
+        """Build the inference chain around ``model`` (a chat model runnable)."""
+        self.chain = self._create_inference_chain(model)
+    def answer(self, prompt, image):
+        """Send ``prompt`` and ``image`` to the model, then show and speak the reply.
+
+        Args:
+            prompt: The user's question; nothing happens when it is empty.
+            image: Base64-encoded JPEG bytes; decoded to text for the data URL.
+        """
+        if not prompt:
+            return
+        st.write("Prompt:", prompt)
+        response = self.chain.invoke(
+            {"prompt": prompt, "image_base64": image.decode()},
+            # A session id is required by the history wrapper, but the chain
+            # below returns the same history object for every id.
+            config={"configurable": {"session_id": "unused"}},
+        ).strip()
+        st.write("Response:", response)
+        if response:
+            self._tts(response)
+    def _tts(self, response):
+        """Stream ``response`` through OpenAI TTS to the default audio output."""
+        audio = pyaudio.PyAudio()
+        try:
+            # 16-bit mono at 24 kHz, matching the "pcm" response format requested below.
+            player = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+            try:
+                with openai.audio.speech.with_streaming_response.create(
+                    model="tts-1",
+                    voice="alloy",
+                    response_format="pcm",
+                    input=response,
+                ) as stream:
+                    for chunk in stream.iter_bytes(chunk_size=1024):
+                        player.write(chunk)
+            finally:
+                # Close the output stream even if synthesis or playback failed.
+                player.stop_stream()
+                player.close()
+        finally:
+            # Release PortAudio resources; a new PyAudio is created on every call.
+            audio.terminate()
+    def _create_inference_chain(self, model):
+        """Return a history-aware chain: prompt template -> model -> string output."""
+        SYSTEM_PROMPT = """
+        You are a witty assistant that will use the chat history and the image
+        provided by the user to answer its questions.
+        Use few words on your answers. Go straight to the point. Do not use any
+        emoticons or emojis. Do not ask the user any questions.
+        Be friendly and helpful. Show some personality. Do not be too formal.
+        """
+        prompt_template = ChatPromptTemplate.from_messages([
+            SystemMessage(content=SYSTEM_PROMPT),
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("human", [
+                {"type": "text", "text": "{prompt}"},
+                {"type": "image_url", "image_url": "data:image/jpeg;base64,{image_base64}"},
+            ]),
+        ])
+        chain = prompt_template | model | StrOutputParser()
+        # One in-memory history shared by every session id for this Assistant.
+        chat_message_history = ChatMessageHistory()
+        return RunnableWithMessageHistory(
+            chain,
+            lambda _: chat_message_history,
+            input_messages_key="prompt",
+            history_messages_key="chat_history",
+        )
+def main():
+    """Render the Streamlit page and handle one run of the script.
+
+    Starts the webcam reader, builds the assistant, draws the search, upload,
+    preview and audio widgets, and always shuts the webcam down again before
+    returning, including when the run is aborted by an exception.
+    """
+    st.title("🚀🌌 Scholarly Article Document Search with Memory")
+    webcam_stream = WebcamStream().start()
+    try:
+        model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
+        assistant = Assistant(model)
+        # Sidebar
+        st.sidebar.title("Options")
+        # NOTE: the checkbox value is not consumed anywhere yet; only the widget is rendered.
+        st.sidebar.checkbox("💾 Save", value=True, help="Save your session data.")
+        # Main content
+        query = st.text_input("Enter your search query:")
+        if st.button("Search"):
+            image = webcam_stream.read(encode=True)
+            assistant.answer(query, image)
+        # File upload
+        uploaded_file = st.file_uploader("Upload a file for context", type=["txt", "pdf", "docx"])
+        if uploaded_file:
+            # Shown as raw bytes; the contents are not passed to the assistant.
+            file_contents = uploaded_file.read()
+            st.write("File contents:", file_contents)
+        # Display webcam feed
+        st.image(webcam_stream.read(), channels="BGR")
+        # Audio recording
+        if st.button("Record Audio"):
+            r = sr.Recognizer()
+            with sr.Microphone() as source:
+                st.write("Speak now...")
+                audio = r.listen(source)
+                st.write("Processing audio...")
+            try:
+                text = r.recognize_google(audio)
+                st.write("You said:", text)
+                assistant.answer(text, webcam_stream.read(encode=True))
+            except sr.UnknownValueError:
+                st.write("Could not understand audio")
+            except sr.RequestError as e:
+                st.write(f"Could not request results; {e}")
+    finally:
+        # Always stop the reader thread and free the camera; otherwise a
+        # failed run leaves the thread running and the device held open.
+        webcam_stream.stop()
+        webcam_stream.stream.release()
+# Entry point: call main() only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()