reab5555 committed · Commit fb33e5c · verified · 1 Parent(s): 13d3d89

Update diarization.py

Files changed (1):
  1. diarization.py  +157 -154
diarization.py CHANGED
@@ -1,165 +1,168 @@
  import os
- import torch
- import math
- from moviepy.editor import VideoFileClip, AudioFileClip
- from pyannote.audio import Pipeline
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
- import librosa
- import datetime
- from collections import defaultdict
- import numpy as np
  import spaces

- class LazyDiarizationPipeline:
      def __init__(self):
          self.pipeline = None

-     @spaces.GPU(duration=120)
-     def get_pipeline(self, diarization_access_token):
          if self.pipeline is None:
-             self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=diarization_access_token)
-             self.pipeline = self.pipeline.to(torch.device("cuda"))
          return self.pipeline

- lazy_diarization_pipeline = LazyDiarizationPipeline()
-
- class LazyTranscriptionPipeline:
-     def __init__(self):
-         self.model = None
-         self.processor = None
-         self.pipe = None

      @spaces.GPU(duration=120)
-     def get_pipeline(self, language):
-         if self.pipe is None:
-             model_id = "openai/whisper-large-v3"
-             self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                 model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
-             )
-             self.model.to(torch.device("cuda"))
-             self.processor = AutoProcessor.from_pretrained(model_id)
-             self.pipe = pipeline(
-                 "automatic-speech-recognition",
-                 model=self.model,
-                 tokenizer=self.processor.tokenizer,
-                 feature_extractor=self.processor.feature_extractor,
-                 max_new_tokens=128,
-                 chunk_length_s=30,
-                 batch_size=1,
-                 return_timestamps=True,
-                 torch_dtype=torch.float16,
-                 device=torch.device("cuda"),
-                 generate_kwargs={"language": language}
-             )
-         return self.pipe
-
- lazy_transcription_pipeline = LazyTranscriptionPipeline()
-
- def extract_audio(video_path, audio_path):
-     video = VideoFileClip(video_path)
-     audio = video.audio
-     audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
-
- def format_timestamp(seconds):
-     return str(datetime.timedelta(seconds=seconds)).split('.')[0]
-
- @spaces.GPU(duration=100)
- def transcribe_audio(audio_path, language):
-     pipe = lazy_transcription_pipeline.get_pipeline(language)
-
-     audio, sr = librosa.load(audio_path, sr=16000)
-     duration = len(audio) / sr
-     n_chunks = math.ceil(duration / 30)
-     transcription_txt = ""
-     transcription_chunks = []
-
-     for i in range(n_chunks):
-         start = i * 30 * sr
-         end = min((i + 1) * 30 * sr, len(audio))
-         audio_chunk = audio[start:end]
-
-         # Convert the audio chunk to float32 numpy array
-         audio_chunk = (audio_chunk * 32767).astype(np.float32)
-
-         result = pipe(audio_chunk)
-         transcription_txt += result["text"]
-         for chunk in result["chunks"]:
-             start_time, end_time = chunk["timestamp"]
-             transcription_chunks.append({
-                 "start": start_time + i * 30,
-                 "end": end_time + i * 30,
-                 "text": chunk["text"]
-             })
-
-         print(f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
-
-     return transcription_txt, transcription_chunks
-
- def create_combined_srt(transcription_chunks, diarization, output_path):
-     speaker_segments = []
-     speaker_map = {}
-     current_speaker_num = 1
-
-     for segment, _, speaker in diarization.itertracks(yield_label=True):
-         if speaker not in speaker_map:
-             speaker_map[speaker] = f"Speaker {current_speaker_num}"
-             current_speaker_num += 1
-         speaker_segments.append((segment.start, segment.end, speaker_map[speaker]))
-
-     with open(output_path, 'w', encoding='utf-8') as srt_file:
-         for i, chunk in enumerate(transcription_chunks, 1):
-             start_time, end_time = chunk["start"], chunk["end"]
-             text = chunk["text"]
-
-             # Find the corresponding speaker
-             current_speaker = "Unknown"
-             for seg_start, seg_end, speaker in speaker_segments:
-                 if seg_start <= start_time < seg_end:
-                     current_speaker = speaker
-                     break
-
-             # Format timecodes as h:mm:ss (without leading zeros for hours)
-             start_str = format_timestamp(start_time).split('.')[0].lstrip('0')
-             end_str = format_timestamp(end_time).split('.')[0].lstrip('0')
-
-             srt_file.write(f"{i}\n")
-             srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")
-
-     # Add dominant speaker information
-     speaker_durations = defaultdict(float)
-     for seg_start, seg_end, speaker in speaker_segments:
-         speaker_durations[speaker] += seg_end - seg_start
-
-     dominant_speaker = max(speaker_durations, key=speaker_durations.get)
-     dominant_duration = speaker_durations[dominant_speaker]
-
-     with open(output_path, 'a', encoding='utf-8') as srt_file:
-         dominant_duration_str = format_timestamp(dominant_duration).split('.')[0].lstrip('0')
-         srt_file.write(f"\nMost dominant speaker: {dominant_speaker} with total duration {dominant_duration_str}\n")
-
- @spaces.GPU(duration=100)
- def process_video(video_path, diarization_access_token, language):
-     base_name = os.path.splitext(video_path)[0]
-     audio_path = f"{base_name}.wav"
-     extract_audio(video_path, audio_path)
-
-     # Diarization
-     print("Performing diarization...")
-     pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
-     diarization = pipeline(audio_path)
-     print("Diarization complete.")
-
-     # Transcription
-     print("Performing transcription...")
-     transcription, chunks = transcribe_audio(audio_path, language)
-     print("Transcription complete.")
-
-     # Create combined SRT file
-     combined_srt_path = f"{base_name}_combined.srt"
-     create_combined_srt(chunks, diarization, combined_srt_path)
-     print(f"Combined SRT file created and saved to {combined_srt_path}")
-
-     # Clean up
-     os.remove(audio_path)
-
-     return combined_srt_path
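For reference, the module removed above exposes a single entry point, process_video(video_path, diarization_access_token, language), which extracts the audio track, runs pyannote speaker diarization and Whisper transcription, writes a <basename>_combined.srt file, and returns its path. A minimal call sketch, assuming the module is importable as diarization and the token has access to pyannote/speaker-diarization-3.1 (the file name and token below are placeholders, not part of the commit):

    # Illustrative usage only -- placeholder path and token.
    from diarization import process_video
    srt_path = process_video("interview.mp4", "<pyannote-access-token>", "en")
    with open(srt_path, encoding="utf-8") as f:
        print(f.read())

The new contents of diarization.py follow below.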
 
  import os
+ import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ from langchain.llms import HuggingFacePipeline
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.chains import RetrievalQA
+ from huggingface_hub import login
+ import diarization
+ import shutil
  import spaces
+ import time

+ # Get Hugging Face token from Space secret
+ hf_token = os.environ.get('hf_secret')
+ if not hf_token:
+     raise ValueError("HF_TOKEN not found in environment variables. Please set it in the Space secrets.")
+
+ # Login to Hugging Face
+ login(token=hf_token)
+
+ # Lazy initialization for the pipeline
+ class LazyPipeline:
      def __init__(self):
          self.pipeline = None

+     @spaces.GPU(duration=250)
+     def get_pipeline(self):
          if self.pipeline is None:
+             import torch
+             model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+             tokenizer = AutoTokenizer.from_pretrained(model_name)
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16,
+                 device_map="auto",
+             )
+             self.pipeline = pipeline(
+                 "text-generation",
+                 model=model,
+                 tokenizer=tokenizer,
+                 max_new_tokens=512,
+                 temperature=0.5,
+                 top_p=0.95,
+                 repetition_penalty=1.15
+             )
          return self.pipeline

+ lazy_pipe = LazyPipeline()
+
+ # Create a LangChain wrapper around the pipeline
+ class LazyLLM:
+     def __init__(self, lazy_pipeline):
+         self.lazy_pipeline = lazy_pipeline
+         self.llm = None
+
+     @spaces.GPU(duration=250)
+     def get_llm(self):
+         if self.llm is None:
+             pipe = self.lazy_pipeline.get_pipeline()
+             self.llm = HuggingFacePipeline(pipeline=pipe)
+         return self.llm
+
+ lazy_llm = LazyLLM(lazy_pipe)
+
+ # Load instruction files
+ def load_instructions(file_path):
+     with open(file_path, 'r') as file:
+         return file.read()
+
+ general_task = load_instructions("tasks/general_task.txt")
+ attachments_task = load_instructions("tasks/Attachments_task.txt")
+ bigfive_task = load_instructions("tasks/BigFive_task.txt")
+ personalities_task = load_instructions("tasks/Personalities_task.txt")
+
+ # Load knowledge files
+ def load_knowledge(file_path):
+     with open(file_path, 'r') as file:
+         return file.read()
+
+ attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_definitions.txt")
+ bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
+ personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
+
+ # Lazy initialization for retrieval chains
+ class LazyChains:
+     def __init__(self, lazy_llm):
+         self.lazy_llm = lazy_llm
+         self.attachments_chain = None
+         self.bigfive_chain = None
+         self.personalities_chain = None

      @spaces.GPU(duration=120)
+     def get_chains(self):
+         if self.attachments_chain is None:
+             llm = self.lazy_llm.get_llm()
+             self.attachments_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=attachments_knowledge)
+             self.bigfive_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=bigfive_knowledge)
+             self.personalities_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=personalities_knowledge)
+         return self.attachments_chain, self.bigfive_chain, self.personalities_chain
+
+ lazy_chains = LazyChains(lazy_llm)
+
+ # Function to process video file
+ @spaces.GPU(duration=120)
+ def process_video(video_file):
+     start_time = time.time()
+
+     # Copy the uploaded video file to a temporary location
+     temp_video_path = "temp_video.mp4"
+     shutil.copy2(video_file.name, temp_video_path)
+
+     # Initialize progress bar
+     progress = gr.Progress()
+
+     # Display progress bar for diarization
+     progress(0, desc="Starting Diarization...")
+     # Process the video using the diarization script
+     language = "en"
+     diarization.process_video(temp_video_path, hf_token, language)
+     progress(50, desc="Diarization Complete.")
+
+     # The SRT file will be created with the same name as the video file but with .srt extension
+     srt_path = temp_video_path.replace(".mp4", "_combined.srt")
+
+     # Read the content of the SRT file
+     with open(srt_path, 'r', encoding='utf-8') as file:
+         srt_content = file.read()
+
+     # Get the chains
+     attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
+
+     # Process with LangChain and display progress bars
+     progress(50, desc="Processing Attachments Analysis...")
+     attachments_result = attachments_chain.run(srt_content)
+     progress(70, desc="Attachments Analysis Complete.")
+
+     progress(70, desc="Processing Big Five Analysis...")
+     bigfive_result = bigfive_chain.run(srt_content)
+     progress(90, desc="Big Five Analysis Complete.")
+
+     progress(90, desc="Processing Personalities Analysis...")
+     personalities_result = personalities_chain.run(srt_content)
+     progress(100, desc="Personalities Analysis Complete.")
+
+     # Combine results
+     final_result = f"Attachments Analysis:\n{attachments_result}\n\nBig Five Analysis:\n{bigfive_result}\n\nPersonalities Analysis:\n{personalities_result}"
+
+     end_time = time.time()
+     execution_time = end_time - start_time
+
+     # Only return execution time and final result
+     final_result_with_time = f"Execution Time: {execution_time:.2f} seconds\n\n{final_result}"
+
+     return final_result_with_time
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=process_video,
+     inputs=gr.File(label="Upload Video File"),
+     outputs=gr.Textbox(label="Analysis Result"),
+     title="Video Analysis with Meta-Llama-3.1-8B-Instruct",
+     description="Upload a video file to analyze using RAG techniques with Meta-Llama-3.1-8B-Instruct."
+ )
+
+ # Launch the app
+ iface.launch()
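One note on LazyChains.get_chains in the new version: LangChain's RetrievalQA.from_chain_type expects its retriever argument to be a retriever object, while the code above passes the raw strings returned by load_knowledge. A minimal sketch of building such a retriever from one of those strings, using the FAISS and HuggingFaceEmbeddings imports already present; the helper name, chunk sizes, and embedding model below are illustrative assumptions, not part of this commit:

    # Sketch only: wrap a raw knowledge string in a FAISS-backed retriever for RetrievalQA.
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS

    def build_retriever(knowledge_text):
        # Split the text into overlapping passages so retrieval works on chunks, not the whole file.
        chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_text(knowledge_text)
        # Embed the passages and index them in an in-memory FAISS store.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        return FAISS.from_texts(chunks, embedding=embeddings).as_retriever(search_kwargs={"k": 4})

    # e.g. retriever=build_retriever(attachments_knowledge) in RetrievalQA.from_chain_type(...)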