Spaces:

reab5555
/

Multiple-Speakers-Personality-Analyzer

Runtime error

App Files Files Community

reab5555 committed on Aug 2, 2024

Commit

db2d41c

verified ·

1 Parent(s): 0d98195

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -29

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from langchain.llms import HuggingFacePipeline
 from langchain_community.document_loaders import TextLoader
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
@@ -83,12 +84,6 @@ attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_defini
 bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
 personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
-# Create vector stores
-embeddings = HuggingFaceEmbeddings()
-attachments_db = FAISS.from_texts([attachments_knowledge], embeddings)
-bigfive_db = FAISS.from_texts([bigfive_knowledge], embeddings)
-personalities_db = FAISS.from_texts([personalities_knowledge], embeddings)
 # Lazy initialization for retrieval chains
 class LazyChains:
     def __init__(self, lazy_llm):
@@ -101,9 +96,9 @@ class LazyChains:
     def get_chains(self):
         if self.attachments_chain is None:
             llm = self.lazy_llm.get_llm()
-            self.attachments_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=attachments_db.as_retriever())
-            self.bigfive_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=bigfive_db.as_retriever())
-            self.personalities_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=personalities_db.as_retriever())
         return self.attachments_chain, self.bigfive_chain, self.personalities_chain
 lazy_chains = LazyChains(lazy_llm)
@@ -117,15 +112,12 @@ def process_video(video_file):
     temp_video_path = "temp_video.mp4"
     shutil.copy2(video_file.name, temp_video_path)
-    # Initialize progress bar
-    progress = gr.Progress()
     # Display progress bar for diarization
-    progress(0, desc="Starting Diarization...")
-    # Process the video using the diarization script
-    language = "en"
-    diarization.process_video(temp_video_path, hf_token, language)
-    progress(50, desc="Diarization Complete.")
     # The SRT file will be created with the same name as the video file but with .srt extension
     srt_path = temp_video_path.replace(".mp4", "_combined.srt")
@@ -138,17 +130,17 @@ def process_video(video_file):
     attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
     # Process with LangChain and display progress bars
-    progress(50, desc="Processing Attachments Analysis...")
-    attachments_result = attachments_chain.run(srt_content)
-    progress(70, desc="Attachments Analysis Complete.")
-    progress(70, desc="Processing Big Five Analysis...")
-    bigfive_result = bigfive_chain.run(srt_content)
-    progress(90, desc="Big Five Analysis Complete.")
-    progress(90, desc="Processing Personalities Analysis...")
-    personalities_result = personalities_chain.run(srt_content)
-    progress(100, desc="Personalities Analysis Complete.")
     # Combine results
     final_result = f"Attachments Analysis:\n{attachments_result}\n\nBig Five Analysis:\n{bigfive_result}\n\nPersonalities Analysis:\n{personalities_result}"
@@ -156,7 +148,7 @@ def process_video(video_file):
     end_time = time.time()
     execution_time = end_time - start_time
-    # Only return execution time and final result
     final_result_with_time = f"Execution Time: {execution_time:.2f} seconds\n\n{final_result}"
     return final_result_with_time
@@ -165,10 +157,177 @@ def process_video(video_file):
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.File(label="Upload Video File"),
-    outputs=gr.Textbox(label="Results"),
     title="Video Analysis with Meta-Llama-3.1-8B-Instruct",
     description="Upload a video file to analyze using RAG techniques with Meta-Llama-3.1-8B-Instruct."
 )
 # Launch the app
-iface.launch()

 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from langchain.llms import HuggingFacePipeline
 from langchain_community.document_loaders import TextLoader
+from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
 personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
 # Lazy initialization for retrieval chains
 class LazyChains:
     def __init__(self, lazy_llm):
     def get_chains(self):
         if self.attachments_chain is None:
             llm = self.lazy_llm.get_llm()
+            self.attachments_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=attachments_knowledge)
+            self.bigfive_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=bigfive_knowledge)
+            self.personalities_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=personalities_knowledge)
         return self.attachments_chain, self.bigfive_chain, self.personalities_chain
 lazy_chains = LazyChains(lazy_llm)
     temp_video_path = "temp_video.mp4"
     shutil.copy2(video_file.name, temp_video_path)
     # Display progress bar for diarization
+    with gr.Progress(0, 100, "Processing Diarization...") as progress_diarization:
+        # Process the video using the diarization script
+        language = "en"
+        diarization.process_video(temp_video_path, hf_token, language)
+        progress_diarization.update(100)
     # The SRT file will be created with the same name as the video file but with .srt extension
     srt_path = temp_video_path.replace(".mp4", "_combined.srt")
     attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
     # Process with LangChain and display progress bars
+    with gr.Progress(0, 100, "Processing Attachments Analysis...") as progress_attachments:
+        attachments_result = attachments_chain.run(srt_content)
+        progress_attachments.update(100)
+    with gr.Progress(0, 100, "Processing Big Five Analysis...") as progress_bigfive:
+        bigfive_result = bigfive_chain.run(srt_content)
+        progress_bigfive.update(100)
+    with gr.Progress(0, 100, "Processing Personalities Analysis...") as progress_personalities:
+        personalities_result = personalities_chain.run(srt_content)
+        progress_personalities.update(100)
     # Combine results
     final_result = f"Attachments Analysis:\n{attachments_result}\n\nBig Five Analysis:\n{bigfive_result}\n\nPersonalities Analysis:\n{personalities_result}"
     end_time = time.time()
     execution_time = end_time - start_time
+    # Prepend execution time to final result
     final_result_with_time = f"Execution Time: {execution_time:.2f} seconds\n\n{final_result}"
     return final_result_with_time
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.File(label="Upload Video File"),
+    outputs=gr.Textbox(label="Analysis Result"),
     title="Video Analysis with Meta-Llama-3.1-8B-Instruct",
     description="Upload a video file to analyze using RAG techniques with Meta-Llama-3.1-8B-Instruct."
 )
 # Launch the app
+iface.launch()
+# Diarization script
+import os
+import torch
+import math
+from moviepy.editor import VideoFileClip, AudioFileClip
+from pyannote.audio import Pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import librosa
+import datetime
+from collections import defaultdict
+import numpy as np
+import spaces
class LazyDiarizationPipeline:
    """Load the pyannote speaker-diarization pipeline on first use and cache it."""

    def __init__(self):
        # Stays None until get_pipeline() has loaded the pipeline successfully.
        self.pipeline = None

    @spaces.GPU(duration=120)
    def get_pipeline(self, diarization_access_token):
        """Return the cached diarization pipeline, loading it on the first call.

        Args:
            diarization_access_token: Hugging Face access token, forwarded to
                ``Pipeline.from_pretrained`` as ``use_auth_token``.

        Returns:
            The ``pyannote/speaker-diarization-3.1`` pipeline placed on CUDA.

        Raises:
            RuntimeError: If ``Pipeline.from_pretrained`` returned ``None``.
        """
        if self.pipeline is None:
            loaded = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=diarization_access_token,
            )
            # pyannote reports some download/authorisation failures by
            # returning None instead of raising; surface that explicitly
            # rather than failing later with an AttributeError on `.to`.
            if loaded is None:
                raise RuntimeError(
                    "Could not load 'pyannote/speaker-diarization-3.1'; check "
                    "that the access token is valid and the model's user "
                    "conditions have been accepted."
                )
            # Cache only after the load and device move both succeeded, so a
            # failed attempt can be retried on the next call.
            self.pipeline = loaded.to(torch.device("cuda"))
        return self.pipeline


lazy_diarization_pipeline = LazyDiarizationPipeline()
class LazyTranscriptionPipeline:
    """Load the Whisper speech-recognition pipeline on first use and cache it."""

    def __init__(self):
        # Model and processor are loaded once and reused across languages.
        self.model = None
        self.processor = None
        # The ASR pipeline wrapper and the language it was configured for.
        self.pipe = None
        self.language = None

    @spaces.GPU(duration=120)
    def get_pipeline(self, language):
        """Return an ASR pipeline configured for ``language``.

        The model weights are loaded only once. The pipeline wrapper is
        rebuilt when ``language`` differs from the one used for the cached
        wrapper, so a later call never silently reuses another language's
        generation settings.

        Args:
            language: Language passed to generation via
                ``generate_kwargs={"language": language}``.

        Returns:
            The ``automatic-speech-recognition`` pipeline for ``language``.
        """
        if self.model is None or self.processor is None:
            model_id = "openai/whisper-large-v3"
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
            )
            self.model.to(torch.device("cuda"))
            self.processor = AutoProcessor.from_pretrained(model_id)
            # Force the wrapper below to be (re)built around the fresh model.
            self.pipe = None

        if self.pipe is None or self.language != language:
            self.pipe = pipeline(
                "automatic-speech-recognition",
                model=self.model,
                tokenizer=self.processor.tokenizer,
                feature_extractor=self.processor.feature_extractor,
                max_new_tokens=128,
                chunk_length_s=30,
                batch_size=1,
                return_timestamps=True,
                torch_dtype=torch.float16,
                device=torch.device("cuda"),
                generate_kwargs={"language": language}
            )
            self.language = language
        return self.pipe


lazy_transcription_pipeline = LazyTranscriptionPipeline()
def extract_audio(video_path, audio_path):
    """Write the audio track of a video to ``audio_path`` as 16 kHz PCM.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the extracted audio.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
    finally:
        # Release the clip's underlying readers even when writing fails.
        video.close()
def format_timestamp(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    Fractional seconds are dropped. Hours are not zero-padded and are not
    wrapped into days, so 90000 seconds renders as ``25:00:00``. Negative
    durations are prefixed with ``-``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted duration string.
    """
    delta = datetime.timedelta(seconds=seconds)
    # days/seconds are the integral part of the timedelta; leaving out
    # `microseconds` discards the fraction.
    whole_seconds = delta.days * 86400 + delta.seconds
    sign = "-" if whole_seconds < 0 else ""
    hours, remainder = divmod(abs(whole_seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours}:{minutes:02d}:{secs:02d}"
@spaces.GPU(duration=100)
def transcribe_audio(audio_path, language):
    """Transcribe an audio file in consecutive 30-second windows.

    Args:
        audio_path: Path of the audio file; it is loaded resampled to 16 kHz.
        language: Language forwarded to the transcription pipeline.

    Returns:
        A ``(text, chunks)`` tuple. ``text`` is the concatenated transcription
        and ``chunks`` is a list of ``{"start", "end", "text"}`` dicts whose
        times are in seconds measured from the start of the file.
    """
    window_seconds = 30
    pipe = lazy_transcription_pipeline.get_pipeline(language)
    audio, sr = librosa.load(audio_path, sr=16000)
    duration = len(audio) / sr
    n_chunks = math.ceil(duration / window_seconds)
    samples_per_window = window_seconds * sr

    text_parts = []
    transcription_chunks = []
    for i in range(n_chunks):
        start = i * samples_per_window
        end = min(start + samples_per_window, len(audio))
        # Pass the waveform through unscaled: librosa already yields floats
        # in [-1, 1], and multiplying by 32767 would feed the model input
        # far outside that range.
        audio_chunk = audio[start:end].astype(np.float32)
        result = pipe(audio_chunk)
        text_parts.append(result["text"])

        offset = i * window_seconds
        window_duration = (end - start) / sr
        for chunk in result["chunks"]:
            start_time, end_time = chunk["timestamp"]
            # The pipeline can leave the final end timestamp as None when
            # speech is cut off at the window edge; close the segment at the
            # end of the window instead of failing on None + int.
            if end_time is None:
                end_time = window_duration
            transcription_chunks.append({
                "start": start_time + offset,
                "end": end_time + offset,
                "text": chunk["text"]
            })
        print(f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
    return "".join(text_parts), transcription_chunks
def create_combined_srt(transcription_chunks, diarization, output_path):
    """Write a speaker-labelled transcript file.

    Each transcription chunk becomes a numbered entry carrying the speaker
    whose diarization segment contains the chunk's start time ("Unknown" if
    none does). When at least one speaker segment exists, a final line names
    the speaker with the largest total segment duration.

    Args:
        transcription_chunks: Iterable of dicts with ``"start"``, ``"end"``
            (seconds) and ``"text"`` keys.
        diarization: Object whose ``itertracks(yield_label=True)`` yields
            ``(segment, track, label)`` triples, where ``segment`` has
            ``start`` and ``end`` attributes (presumably a pyannote
            annotation; verify against the caller).
        output_path: Path of the file to write (UTF-8, overwritten).
    """
    # Map raw diarization labels to "Speaker N" in order of first appearance.
    speaker_map = {}
    speaker_segments = []
    for segment, _, speaker in diarization.itertracks(yield_label=True):
        if speaker not in speaker_map:
            speaker_map[speaker] = f"Speaker {len(speaker_map) + 1}"
        speaker_segments.append((segment.start, segment.end, speaker_map[speaker]))

    speaker_durations = defaultdict(float)
    for seg_start, seg_end, speaker in speaker_segments:
        speaker_durations[speaker] += seg_end - seg_start

    with open(output_path, 'w', encoding='utf-8') as srt_file:
        for i, chunk in enumerate(transcription_chunks, 1):
            start_time, end_time = chunk["start"], chunk["end"]
            text = chunk["text"]

            # The first segment containing the chunk's start time wins.
            current_speaker = next(
                (
                    speaker
                    for seg_start, seg_end, speaker in speaker_segments
                    if seg_start <= start_time < seg_end
                ),
                "Unknown",
            )

            # Timecodes are h:mm:ss. Do not strip leading zeros here: that
            # would turn "0:00:05" into ":00:05".
            start_str = format_timestamp(start_time)
            end_str = format_timestamp(end_time)
            srt_file.write(f"{i}\n")
            srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")

        # Without any speaker segment there is no dominant speaker to report.
        if speaker_durations:
            dominant_speaker = max(speaker_durations, key=speaker_durations.get)
            dominant_duration_str = format_timestamp(speaker_durations[dominant_speaker])
            srt_file.write(f"\nMost dominant speaker: {dominant_speaker} with total duration {dominant_duration_str}\n")
@spaces.GPU(duration=100)
def process_video(video_path, diarization_access_token, language):
    """Diarize and transcribe a video, producing a combined transcript file.

    The audio track is extracted to a temporary ``.wav`` next to the video,
    diarized, transcribed, and the two results are merged into
    ``<video name>_combined.srt``. The temporary audio file is removed
    afterwards, including when a stage fails.

    Args:
        video_path: Path of the input video.
        diarization_access_token: Access token for the diarization pipeline.
        language: Language forwarded to transcription.

    Returns:
        Path of the combined transcript file.
    """
    base_name = os.path.splitext(video_path)[0]
    audio_path = f"{base_name}.wav"
    combined_srt_path = f"{base_name}_combined.srt"

    try:
        extract_audio(video_path, audio_path)

        # Diarization
        print("Performing diarization...")
        # Named to avoid shadowing the `pipeline` factory imported from
        # transformers at module level.
        diarization_pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
        diarization = diarization_pipeline(audio_path)
        print("Diarization complete.")

        # Transcription
        print("Performing transcription...")
        transcription, chunks = transcribe_audio(audio_path, language)
        print("Transcription complete.")

        # Create combined SRT file
        create_combined_srt(chunks, diarization, combined_srt_path)
        print(f"Combined SRT file created and saved to {combined_srt_path}")
    finally:
        # Clean up the temporary audio even if a stage above raised. Never
        # delete the caller's input when it already has a .wav extension.
        if audio_path != video_path and os.path.exists(audio_path):
            os.remove(audio_path)

    return combined_srt_path