awacke1 committed on
Commit
0729bfc
·
verified ·
1 Parent(s): 18d2f9f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -91
app.py CHANGED
@@ -1,69 +1,21 @@
1
  import base64
2
- from threading import Lock, Thread
3
-
4
  import cv2
5
- import openai
6
- import sounddevice as sd
7
  import streamlit as st
 
 
8
  from cv2 import VideoCapture, imencode
9
  from dotenv import load_dotenv
 
10
  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
11
  from langchain.schema.messages import SystemMessage
12
  from langchain_community.chat_message_histories import ChatMessageHistory
13
  from langchain_core.output_parsers import StrOutputParser
14
  from langchain_core.runnables.history import RunnableWithMessageHistory
15
  from langchain_openai import ChatOpenAI
16
- from scipy.io.wavfile import write
17
- from speech_recognition import Recognizer, UnknownValueError, AudioData
18
 
19
  load_dotenv()
20
 
21
-
22
- class WebcamStream:
23
- def __init__(self):
24
- self.stream = VideoCapture(index=0)
25
- _, self.frame = self.stream.read()
26
- self.running = False
27
- self.lock = Lock()
28
-
29
- def start(self):
30
- if self.running:
31
- return self
32
-
33
- self.running = True
34
-
35
- self.thread = Thread(target=self.update, args=())
36
- self.thread.start()
37
- return self
38
-
39
- def update(self):
40
- while self.running:
41
- _, frame = self.stream.read()
42
-
43
- self.lock.acquire()
44
- self.frame = frame
45
- self.lock.release()
46
-
47
- def read(self, encode=False):
48
- self.lock.acquire()
49
- frame = self.frame.copy()
50
- self.lock.release()
51
-
52
- if encode:
53
- _, buffer = imencode(".jpeg", frame)
54
- return base64.b64encode(buffer)
55
-
56
- return frame
57
-
58
- def stop(self):
59
- self.running = False
60
- if self.thread.is_alive():
61
- self.thread.join()
62
-
63
- def __exit__(self, exc_type, exc_value, exc_traceback):
64
- self.stream.release()
65
-
66
-
67
  class Assistant:
68
  def __init__(self, model):
69
  self.chain = self._create_inference_chain(model)
@@ -75,7 +27,7 @@ class Assistant:
75
  print("Prompt:", prompt)
76
 
77
  response = self.chain.invoke(
78
- {"prompt": prompt, "image_base64": image.decode()},
79
  config={"configurable": {"session_id": "unused"}},
80
  ).strip()
81
 
@@ -85,8 +37,8 @@ class Assistant:
85
  self._tts(response)
86
 
87
  def _tts(self, response):
88
- # Simulate TTS: normally you'd use a library or API here
89
- print(f"TTS: {response}")
90
 
91
  def _create_inference_chain(self, model):
92
  SYSTEM_PROMPT = """
@@ -104,16 +56,10 @@ class Assistant:
104
  [
105
  SystemMessage(content=SYSTEM_PROMPT),
106
  MessagesPlaceholder(variable_name="chat_history"),
107
- (
108
- "human",
109
- [
110
- {"type": "text", "text": "{prompt}"},
111
- {
112
- "type": "image_url",
113
- "image_url": "data:image/jpeg;base64,{image_base64}",
114
- },
115
- ],
116
- ),
117
  ]
118
  )
119
 
@@ -127,48 +73,44 @@ class Assistant:
127
  history_messages_key="chat_history",
128
  )
129
 
130
-
131
  def main():
132
  st.title("AI Assistant with Webcam Stream")
133
 
134
- # Instantiate Webcam Stream and start it
135
- webcam_stream = WebcamStream().start()
136
-
137
  # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
138
  # You can use OpenAI's GPT-4o model instead of Gemini Flash by uncommenting the following line:
139
  model = ChatOpenAI(model="gpt-4o")
140
  assistant = Assistant(model)
141
 
142
- # UI for webcam feed
143
- st.subheader("Webcam Feed")
 
 
144
 
145
- def run_webcam():
146
- while True:
147
- frame = webcam_stream.read()
148
- _, buffer = cv2.imencode('.jpg', frame)
149
- frame_data = base64.b64encode(buffer).decode('utf-8')
150
 
151
- # Display frame in Streamlit app
152
- st.image(f"data:image/jpeg;base64,{frame_data}", use_column_width=True)
153
- st.experimental_rerun()
154
 
155
- webcam_thread = Thread(target=run_webcam)
156
- webcam_thread.start()
 
 
 
 
157
 
 
158
  st.subheader("Ask the Assistant")
159
 
160
  prompt = st.text_input("Enter your question:")
161
 
162
- if st.button("Submit"):
163
- if prompt:
164
- assistant.answer(prompt, webcam_stream.read(encode=True))
165
- else:
166
- st.warning("Please enter a prompt to submit.")
167
-
168
- if st.button("Stop Webcam"):
169
- webcam_stream.stop()
170
- cv2.destroyAllWindows()
171
-
172
 
173
  if __name__ == "__main__":
174
  main()
 
1
  import base64
 
 
2
  import cv2
 
 
3
  import streamlit as st
4
+ import openai
5
+ from threading import Lock, Thread
6
  from cv2 import VideoCapture, imencode
7
  from dotenv import load_dotenv
8
+ from playsound import playsound
9
  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
10
  from langchain.schema.messages import SystemMessage
11
  from langchain_community.chat_message_histories import ChatMessageHistory
12
  from langchain_core.output_parsers import StrOutputParser
13
  from langchain_core.runnables.history import RunnableWithMessageHistory
14
  from langchain_openai import ChatOpenAI
15
+ from streamlit_webrtc import webrtc_streamer, VideoTransformerBase
 
16
 
17
  load_dotenv()
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class Assistant:
20
  def __init__(self, model):
21
  self.chain = self._create_inference_chain(model)
 
27
  print("Prompt:", prompt)
28
 
29
  response = self.chain.invoke(
30
+ {"prompt": prompt, "image_base64": image},
31
  config={"configurable": {"session_id": "unused"}},
32
  ).strip()
33
 
 
37
  self._tts(response)
38
 
39
  def _tts(self, response):
40
+ # Simple TTS simulation: Print to the console.
41
+ print(f"TTS Response: {response}")
42
 
43
  def _create_inference_chain(self, model):
44
  SYSTEM_PROMPT = """
 
56
  [
57
  SystemMessage(content=SYSTEM_PROMPT),
58
  MessagesPlaceholder(variable_name="chat_history"),
59
+ ("human", [
60
+ {"type": "text", "text": "{prompt}"},
61
+ {"type": "image_url", "image_url": "data:image/jpeg;base64,{image_base64}"}
62
+ ]),
 
 
 
 
 
 
63
  ]
64
  )
65
 
 
73
  history_messages_key="chat_history",
74
  )
75
 
 
76
def main():
    """Streamlit entry point.

    Shows a live webcam feed (via streamlit-webrtc) and a text input; on
    Submit, the latest webcam frame is snapshotted as a base64 JPEG and
    sent together with the prompt to a multimodal LLM (GPT-4o).
    """
    st.title("AI Assistant with Webcam Stream")

    # model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    # You can use OpenAI's GPT-4o model instead of Gemini Flash by uncommenting the following line:
    # Cache the model/assistant in session state: Streamlit re-runs this
    # script on every widget interaction, and rebuilding the Assistant each
    # time would discard the chat history held by its
    # RunnableWithMessageHistory chain.
    if "assistant" not in st.session_state:
        model = ChatOpenAI(model="gpt-4o")
        st.session_state.assistant = Assistant(model)
    assistant = st.session_state.assistant

    class VideoProcessor(VideoTransformerBase):
        """Keeps the most recent webcam frame so it can be snapshotted on
        demand as a base64-encoded JPEG."""

        def __init__(self):
            # transform() runs on a streamlit-webrtc worker thread, so the
            # stored frame must be guarded against concurrent access.
            self.lock = Lock()
            self.frame = None  # last BGR frame, or None before first frame

        def transform(self, frame):
            # Store the latest frame; return it unchanged so the live
            # preview keeps rendering.
            with self.lock:
                self.frame = frame.to_ndarray(format="bgr24")
            return frame

        def get_base64_image(self):
            """Return the latest frame as a base64 JPEG string, or None if
            no frame has arrived yet."""
            with self.lock:
                if self.frame is not None:
                    _, buffer = cv2.imencode('.jpeg', self.frame)
                    return base64.b64encode(buffer).decode('utf-8')
                return None

    ctx = webrtc_streamer(key="example", video_processor_factory=VideoProcessor)
    st.subheader("Ask the Assistant")

    prompt = st.text_input("Enter your question:")

    if ctx.video_processor:
        if st.button("Submit"):
            base64_image = ctx.video_processor.get_base64_image()
            if prompt and base64_image:
                assistant.answer(prompt, base64_image)
            else:
                st.warning("Please enter a prompt and ensure webcam feed is available.")


if __name__ == "__main__":
    main()