reab5555 committed on
Commit db2d41c · verified · Parent: 0d98195

Update app.py

Files changed (1): app.py (+188 -29)
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from langchain.llms import HuggingFacePipeline
 from langchain_community.document_loaders import TextLoader
+from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
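The newly imported CharacterTextSplitter is LangChain's basic helper for chunking long text before embedding; nothing in this diff calls it yet, so the following is an illustrative sketch only, with the chunk_size and chunk_overlap values assumed:

# Illustrative sketch, not from this commit: split a knowledge file into
# overlapping chunks before embedding; the size values are assumptions.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_text(attachments_knowledge)  # returns a list of strings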
@@ -83,12 +84,6 @@ attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_definitions.txt")
 bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
 personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
 
-# Create vector stores
-embeddings = HuggingFaceEmbeddings()
-attachments_db = FAISS.from_texts([attachments_knowledge], embeddings)
-bigfive_db = FAISS.from_texts([bigfive_knowledge], embeddings)
-personalities_db = FAISS.from_texts([personalities_knowledge], embeddings)
-
 # Lazy initialization for retrieval chains
 class LazyChains:
     def __init__(self, lazy_llm):
@@ -101,9 +96,9 @@ class LazyChains:
     def get_chains(self):
         if self.attachments_chain is None:
             llm = self.lazy_llm.get_llm()
-            self.attachments_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=attachments_db.as_retriever())
-            self.bigfive_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=bigfive_db.as_retriever())
-            self.personalities_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=personalities_db.as_retriever())
+            self.attachments_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=attachments_knowledge)
+            self.bigfive_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=bigfive_knowledge)
+            self.personalities_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=personalities_knowledge)
         return self.attachments_chain, self.bigfive_chain, self.personalities_chain
 
 lazy_chains = LazyChains(lazy_llm)
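Note that RetrievalQA.from_chain_type expects a retriever object for its retriever argument, while the + lines above pass the raw knowledge strings; the removed code built real retrievers first. A minimal sketch of that removed pattern, reassembled for one chain:

# Sketch of the removed pattern: embed the raw text, index it with FAISS,
# and hand RetrievalQA an actual retriever object.
embeddings = HuggingFaceEmbeddings()
attachments_db = FAISS.from_texts([attachments_knowledge], embeddings)
attachments_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=attachments_db.as_retriever()
)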
@@ -117,15 +112,12 @@ def process_video(video_file):
     temp_video_path = "temp_video.mp4"
     shutil.copy2(video_file.name, temp_video_path)
 
-    # Initialize progress bar
-    progress = gr.Progress()
-
     # Display progress bar for diarization
-    progress(0, desc="Starting Diarization...")
-    # Process the video using the diarization script
-    language = "en"
-    diarization.process_video(temp_video_path, hf_token, language)
-    progress(50, desc="Diarization Complete.")
+    with gr.Progress(0, 100, "Processing Diarization...") as progress_diarization:
+        # Process the video using the diarization script
+        language = "en"
+        diarization.process_video(temp_video_path, hf_token, language)
+        progress_diarization.update(100)
 
     # The SRT file will be created with the same name as the video file but with .srt extension
     srt_path = temp_video_path.replace(".mp4", "_combined.srt")
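For reference, Gradio's progress tracker is not documented as a context manager; the usual pattern declares it as a default-valued argument of the event handler, and Gradio injects it so the handler can call it with a 0-1 fraction, which is close to what the removed lines did. A minimal sketch of that pattern (the 0.5 milestone is an assumption):

# Sketch: Gradio injects the tracker when it appears as a default argument;
# call it with a fraction between 0 and 1 plus a description.
def process_video(video_file, progress=gr.Progress()):
    progress(0.0, desc="Starting Diarization...")
    diarization.process_video("temp_video.mp4", hf_token, "en")
    progress(0.5, desc="Diarization Complete.")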
@@ -138,17 +130,17 @@ def process_video(video_file):
     attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
 
     # Process with LangChain and display progress bars
-    progress(50, desc="Processing Attachments Analysis...")
-    attachments_result = attachments_chain.run(srt_content)
-    progress(70, desc="Attachments Analysis Complete.")
+    with gr.Progress(0, 100, "Processing Attachments Analysis...") as progress_attachments:
+        attachments_result = attachments_chain.run(srt_content)
+        progress_attachments.update(100)
 
-    progress(70, desc="Processing Big Five Analysis...")
-    bigfive_result = bigfive_chain.run(srt_content)
-    progress(90, desc="Big Five Analysis Complete.")
+    with gr.Progress(0, 100, "Processing Big Five Analysis...") as progress_bigfive:
+        bigfive_result = bigfive_chain.run(srt_content)
+        progress_bigfive.update(100)
 
-    progress(90, desc="Processing Personalities Analysis...")
-    personalities_result = personalities_chain.run(srt_content)
-    progress(100, desc="Personalities Analysis Complete.")
+    with gr.Progress(0, 100, "Processing Personalities Analysis...") as progress_personalities:
+        personalities_result = personalities_chain.run(srt_content)
+        progress_personalities.update(100)
 
     # Combine results
     final_result = f"Attachments Analysis:\n{attachments_result}\n\nBig Five Analysis:\n{bigfive_result}\n\nPersonalities Analysis:\n{personalities_result}"
@@ -156,7 +148,7 @@ def process_video(video_file):
     end_time = time.time()
     execution_time = end_time - start_time
 
-    # Only return execution time and final result
+    # Prepend execution time to final result
     final_result_with_time = f"Execution Time: {execution_time:.2f} seconds\n\n{final_result}"
 
     return final_result_with_time
@@ -165,10 +157,177 @@ def process_video(video_file):
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.File(label="Upload Video File"),
-    outputs=gr.Textbox(label="Results"),
+    outputs=gr.Textbox(label="Analysis Result"),
     title="Video Analysis with Meta-Llama-3.1-8B-Instruct",
     description="Upload a video file to analyze using RAG techniques with Meta-Llama-3.1-8B-Instruct."
 )
 
 # Launch the app
-iface.launch()
+iface.launch()
+
+# Diarization script
+import os
+import torch
+import math
+from moviepy.editor import VideoFileClip, AudioFileClip
+from pyannote.audio import Pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import librosa
+import datetime
+from collections import defaultdict
+import numpy as np
+import spaces
+
+class LazyDiarizationPipeline:
+    def __init__(self):
+        self.pipeline = None
+
+    @spaces.GPU(duration=120)
+    def get_pipeline(self, diarization_access_token):
+        if self.pipeline is None:
+            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=diarization_access_token)
+            self.pipeline = self.pipeline.to(torch.device("cuda"))
+        return self.pipeline
+
+lazy_diarization_pipeline = LazyDiarizationPipeline()
+
+class LazyTranscriptionPipeline:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.pipe = None
+
+    @spaces.GPU(duration=120)
+    def get_pipeline(self, language):
+        if self.pipe is None:
+            model_id = "openai/whisper-large-v3"
+            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            self.model.to(torch.device("cuda"))
+            self.processor = AutoProcessor.from_pretrained(model_id)
+            self.pipe = pipeline(
+                "automatic-speech-recognition",
+                model=self.model,
+                tokenizer=self.processor.tokenizer,
+                feature_extractor=self.processor.feature_extractor,
+                max_new_tokens=128,
+                chunk_length_s=30,
+                batch_size=1,
+                return_timestamps=True,
+                torch_dtype=torch.float16,
+                device=torch.device("cuda"),
+                generate_kwargs={"language": language}
+            )
+        return self.pipe
+
+lazy_transcription_pipeline = LazyTranscriptionPipeline()
+
+def extract_audio(video_path, audio_path):
+    video = VideoFileClip(video_path)
+    audio = video.audio
+    audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
+
+def format_timestamp(seconds):
+    return str(datetime.timedelta(seconds=seconds)).split('.')[0]
+
+@spaces.GPU(duration=100)
+def transcribe_audio(audio_path, language):
+    pipe = lazy_transcription_pipeline.get_pipeline(language)
+
+    audio, sr = librosa.load(audio_path, sr=16000)
+    duration = len(audio) / sr
+    n_chunks = math.ceil(duration / 30)
+    transcription_txt = ""
+    transcription_chunks = []
+
+    for i in range(n_chunks):
+        start = i * 30 * sr
+        end = min((i + 1) * 30 * sr, len(audio))
+        audio_chunk = audio[start:end]
+
+        # Convert the audio chunk to float32 numpy array
+        audio_chunk = (audio_chunk * 32767).astype(np.float32)
+
+        result = pipe(audio_chunk)
+        transcription_txt += result["text"]
+        for chunk in result["chunks"]:
+            start_time, end_time = chunk["timestamp"]
+            transcription_chunks.append({
+                "start": start_time + i * 30,
+                "end": end_time + i * 30,
+                "text": chunk["text"]
+            })
+
+        print(f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
+
+    return transcription_txt, transcription_chunks
+
+def create_combined_srt(transcription_chunks, diarization, output_path):
+    speaker_segments = []
+    speaker_map = {}
+    current_speaker_num = 1
+
+    for segment, _, speaker in diarization.itertracks(yield_label=True):
+        if speaker not in speaker_map:
+            speaker_map[speaker] = f"Speaker {current_speaker_num}"
+            current_speaker_num += 1
+        speaker_segments.append((segment.start, segment.end, speaker_map[speaker]))
+
+    with open(output_path, 'w', encoding='utf-8') as srt_file:
+        for i, chunk in enumerate(transcription_chunks, 1):
+            start_time, end_time = chunk["start"], chunk["end"]
+            text = chunk["text"]
+
+            # Find the corresponding speaker
+            current_speaker = "Unknown"
+            for seg_start, seg_end, speaker in speaker_segments:
+                if seg_start <= start_time < seg_end:
+                    current_speaker = speaker
+                    break
+
+            # Format timecodes as h:mm:ss (without leading zeros for hours)
+            start_str = format_timestamp(start_time).split('.')[0].lstrip('0')
+            end_str = format_timestamp(end_time).split('.')[0].lstrip('0')
+
+            srt_file.write(f"{i}\n")
+            srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")
+
+    # Add dominant speaker information
+    speaker_durations = defaultdict(float)
+    for seg_start, seg_end, speaker in speaker_segments:
+        speaker_durations[speaker] += seg_end - seg_start
+
+    dominant_speaker = max(speaker_durations, key=speaker_durations.get)
+    dominant_duration = speaker_durations[dominant_speaker]
+
+    with open(output_path, 'a', encoding='utf-8') as srt_file:
+        dominant_duration_str = format_timestamp(dominant_duration).split('.')[0].lstrip('0')
+        srt_file.write(f"\nMost dominant speaker: {dominant_speaker} with total duration {dominant_duration_str}\n")
+
+@spaces.GPU(duration=100)
+def process_video(video_path, diarization_access_token, language):
+    base_name = os.path.splitext(video_path)[0]
+    audio_path = f"{base_name}.wav"
+    extract_audio(video_path, audio_path)
+
+    # Diarization
+    print("Performing diarization...")
+    pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
+    diarization = pipeline(audio_path)
+    print("Diarization complete.")
+
+    # Transcription
+    print("Performing transcription...")
+    transcription, chunks = transcribe_audio(audio_path, language)
+    print("Transcription complete.")
+
+    # Create combined SRT file
+    combined_srt_path = f"{base_name}_combined.srt"
+    create_combined_srt(chunks, diarization, combined_srt_path)
+    print(f"Combined SRT file created and saved to {combined_srt_path}")
+
+    # Clean up
+    os.remove(audio_path)
+
+    return combined_srt_path
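For orientation, a minimal sketch of driving the appended pipeline directly, assuming it is saved as the separate diarization module that app.py already imports (the file name and token below are placeholders):

# Hypothetical driver, assuming the appended code lives in diarization.py.
import diarization

srt_path = diarization.process_video("interview.mp4", "hf_xxx", language="en")
with open(srt_path, encoding="utf-8") as f:
    print(f.read()[:500])  # preview the combined speaker/transcript SRT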