awacke1 commited on
Commit
b8750fa
·
verified ·
1 Parent(s): 1cc6fa9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -17
app.py CHANGED
@@ -1,6 +1,11 @@
1
  import base64
 
 
2
  import cv2
3
  import openai
 
 
 
4
  from dotenv import load_dotenv
5
  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
6
  from langchain.schema.messages import SystemMessage
@@ -8,38 +13,162 @@ from langchain_community.chat_message_histories import ChatMessageHistory
8
  from langchain_core.output_parsers import StrOutputParser
9
  from langchain_core.runnables.history import RunnableWithMessageHistory
10
  from langchain_openai import ChatOpenAI
11
- import streamlit as st
 
12
 
13
  load_dotenv()
14
 
15
- class VideoStream:
 
16
  def __init__(self):
17
- self.cap = cv2.VideoCapture(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def get_frame(self):
20
- ret, frame = self.cap.read()
21
- encoded_image = cv2.imencode('.jpg', frame)[1].tobytes()
22
- return encoded_image
23
 
24
  class Assistant:
25
- # ... (Same code as before for the Assistant class)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  def main():
28
- st.title("AI Assistant App")
29
- video_stream = VideoStream()
30
 
31
- # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
32
- model = ChatOpenAI(model="gpt-4o") # Using OpenAI's GPT-4 model
33
 
 
 
 
34
  assistant = Assistant(model)
35
 
36
- if st.button("Start"):
 
 
 
37
  while True:
38
- frame = video_stream.get_frame()
39
- st.image(frame, channels="BGR")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- # Add code to capture audio input and process the response here
42
- # ...
43
 
44
  if __name__ == "__main__":
45
  main()
 
1
  import base64
2
+ from threading import Lock, Thread
3
+
4
  import cv2
5
  import openai
6
+ import sounddevice as sd
7
+ import streamlit as st
8
+ from cv2 import VideoCapture, imencode
9
  from dotenv import load_dotenv
10
  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
11
  from langchain.schema.messages import SystemMessage
 
13
  from langchain_core.output_parsers import StrOutputParser
14
  from langchain_core.runnables.history import RunnableWithMessageHistory
15
  from langchain_openai import ChatOpenAI
16
+ from scipy.io.wavfile import write
17
+ from speech_recognition import Recognizer, UnknownValueError, AudioData
18
 
19
  load_dotenv()
20
 
21
+
22
class WebcamStream:
    """Continuously capture frames from the default webcam on a background
    thread so callers can fetch the most recent frame without blocking on
    the camera device."""

    def __init__(self):
        # Open the default camera (index 0) and grab one frame so that
        # self.frame holds a valid image before the capture thread starts.
        self.stream = VideoCapture(index=0)
        _, self.frame = self.stream.read()
        self.running = False
        self.lock = Lock()
        self.thread = None  # created lazily in start(); lets stop() be safe pre-start

    def start(self):
        """Start the background capture thread.

        Idempotent — calling start() on a running stream is a no-op.
        Returns self so construction can be chained: WebcamStream().start().
        """
        if self.running:
            return self

        self.running = True
        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        # Capture loop executed on the background thread; keeps self.frame
        # refreshed with the latest camera image until stop() clears the flag.
        while self.running:
            _, frame = self.stream.read()
            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the most recent frame.

        When ``encode`` is True the frame is JPEG-encoded and returned as
        base64 bytes (suitable for a data URL / API payload); otherwise the
        raw BGR ndarray copy is returned.
        """
        with self.lock:
            # Copy under the lock so the caller never observes a frame
            # being overwritten by the capture thread.
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Stop the capture thread.

        Safe even if start() was never invoked — the original raised
        AttributeError on self.thread in that case.
        """
        self.running = False
        thread = getattr(self, "thread", None)
        if thread is not None and thread.is_alive():
            thread.join()

    def __enter__(self):
        # Enable `with WebcamStream() as cam:` usage (the class already
        # defined __exit__ but had no __enter__, so the protocol was broken).
        return self.start()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Stop the capture thread before releasing the device; otherwise the
        # thread would keep calling read() on a released VideoCapture.
        self.stop()
        self.stream.release()
65
 
 
 
 
 
66
 
67
class Assistant:
    """Wraps a chat model in a history-aware inference chain that answers
    user questions about a text prompt plus a base64-encoded webcam image,
    then relays the answer through a (simulated) text-to-speech step."""

    def __init__(self, model):
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Invoke the chain on *prompt* and *image* (base64 bytes).

        A falsy prompt is ignored; a non-empty response is forwarded to
        the TTS step.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        payload = {"prompt": prompt, "image_base64": image.decode()}
        session = {"configurable": {"session_id": "unused"}}
        response = self.chain.invoke(payload, config=session).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        # Simulate TTS: normally you'd use a library or API here
        print(f"TTS: {response}")

    def _create_inference_chain(self, model):
        """Build the prompt | model | parser pipeline wrapped with an
        in-memory chat history keyed by a single fixed session id."""
        system_prompt = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=system_prompt),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        pipeline = template | model | StrOutputParser()

        # One shared in-memory history; the lambda ignores the session id,
        # so every invocation shares the same conversation state.
        history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            pipeline,
            lambda _: history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
129
+
130
 
131
def main():
    """Streamlit entry point: show the current webcam frame and let the
    user ask the assistant about it."""
    st.title("AI Assistant with Webcam Stream")

    # Streamlit re-runs this whole script on every interaction, so the
    # camera stream must live in session_state — otherwise each rerun
    # would re-open the device and leak capture handles.
    if "webcam_stream" not in st.session_state:
        st.session_state.webcam_stream = WebcamStream().start()
    webcam_stream = st.session_state.webcam_stream

    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    model = ChatOpenAI(model="gpt-4o")
    assistant = Assistant(model)

    st.subheader("Webcam Feed")
    # Render the latest frame once per script run. Streamlit widgets must
    # be driven from the script thread: the previous version called
    # st.image / st.experimental_rerun from a background Thread, which
    # Streamlit does not support (no ScriptRunContext in worker threads).
    frame = webcam_stream.read()
    if frame is not None:
        st.image(frame, channels="BGR", use_column_width=True)

    st.subheader("Ask the Assistant")

    prompt = st.text_input("Enter your question:")

    if st.button("Submit"):
        if prompt:
            assistant.answer(prompt, webcam_stream.read(encode=True))
        else:
            st.warning("Please enter a prompt to submit.")

    if st.button("Stop Webcam"):
        webcam_stream.stop()
        # Release the device so other apps can use the camera; the old
        # code only joined the thread and never freed the capture handle.
        webcam_stream.stream.release()
        del st.session_state["webcam_stream"]


if __name__ == "__main__":
    main()