Upload 8 files
Changed files:
- app.py  +116 -89
- config.py  +8 -0
- llm_loader.py  +12 -0
- output_parser.py  +4 -66
- processing.py  +57 -223
- requirements.txt  +6 -5
- transcription_diarization.py  +52 -42
- visualization.py  +67 -96
app.py
CHANGED
@@ -1,92 +1,119 @@
-(The previous 92-line app.py is almost entirely unreadable in this export; only its `import gradio as gr` and `from processing import process_input` lines and the closing `if __name__ == "__main__": demo.launch()` block survive. The new version follows.)
 import gradio as gr
+from llm_loader import load_model
 from processing import process_input
+from transcription_diarization import process_video
+from visualization import create_charts
+import os
+import time
+from config import hf_token, openai_api_key
+
+# Load the model
+llm = load_model(openai_api_key)
+
+# Mapping of display names to language codes
+LANGUAGE_MAP = {
+    "English": "en",
+    "Hebrew": "he",
+    "Italian": "it",
+    "French": "fr",
+    "German": "de",
+    "Chinese": "zh",
+    "Arabic": "ar"
+}
+
+
+def analyze_video(video_path, language_display_name, max_speakers, progress=gr.Progress()):
+    start_time = time.time()
+
+    if not video_path:
+        return "Please upload a video file.", gr.Textbox.update(value="Analysis not started.")
+
+    # Convert the display name to the language code
+    language = LANGUAGE_MAP[language_display_name]
+
+    # Start the progress bar
+    progress(0, desc="Starting analysis...")
+
+    # Progress for diarization
+    progress(0.2, desc="Starting diarization...")
+    srt_path = process_video(video_path, hf_token, language, max_speakers)
+    progress(0.4, desc="Diarization complete.")
+
+    # Progress for transcription
+    with open(srt_path, 'r', encoding='utf-8') as file:
+        transcription = file.read()
+    progress(0.6, desc="Transcription complete.")
+
+    # Progress for processing the transcription
+    progress(0.7, desc="Processing transcription...")
+    results = process_input(transcription, llm)
+    progress(0.8, desc="Transcription processing complete.")
+
+    # Progress for creating charts
+    progress(0.9, desc="Generating charts...")
+    charts, explanations = create_charts(results)
+    progress(1.0, desc="Charts generation complete.")
+
+    # Clean up the temporary SRT file
+    os.remove(srt_path)
+
+    end_time = time.time()
+    execution_time = end_time - start_time
+
+    # Prepare outputs for each speaker
+    output_components = []
+    for speaker_id, speaker_charts in charts.items():
+        speaker_explanations = explanations[speaker_id]
+
+        output_components.extend([
+            gr.Markdown(f"### Speaker {speaker_id}"),
+            gr.Plot(speaker_charts["attachment"]),
+            gr.Textbox(value=speaker_explanations["attachment"],
+                       label=f"Attachment Styles Explanation - Speaker {speaker_id}", lines=2),
+            gr.Plot(speaker_charts["dimensions"]),
+            gr.Plot(speaker_charts["bigfive"]),
+            gr.Textbox(value=speaker_explanations["bigfive"],
+                       label=f"Big Five Traits Explanation - Speaker {speaker_id}", lines=2),
+            gr.Plot(speaker_charts["personality"]),
+            gr.Textbox(value=speaker_explanations["personality"],
+                       label=f"Personality Disorders Explanation - Speaker {speaker_id}", lines=2)
+        ])
+
+    # Add the transcript and execution info at the end
+    output_components.extend([
+        gr.Textbox(value=transcription, label="Transcript", lines=10),
+        gr.Textbox.update(value=f"Completed in {int(execution_time)} seconds.", label="Execution Information",
+                          visible=True)
+    ])
+
+    return output_components, gr.Textbox.update(value=f"Completed in {int(execution_time)} seconds.")
+
+
+# Define the Gradio interface
+with gr.Blocks() as iface:
+    gr.Markdown("# Video Analysis Tool")
+    gr.Markdown("Upload a video to analyze speech patterns and personality traits.")
+
+    video_input = gr.Video(label="Upload Video")
+    language_input = gr.Dropdown(choices=list(LANGUAGE_MAP.keys()), value="English", label="Select Language")
+    max_speakers = gr.Slider(minimum=1, maximum=4, step=1, value=2, label="Maximum Number of Speakers")
+
+    analyze_button = gr.Button("Analyze")
+
+    # Placeholder for dynamic outputs
+    output_section = gr.Column()
+
+    # Execution time box, initially displaying a waiting message
+    execution_info_box = gr.Textbox(label="Execution Information", value="Waiting for analysis...", lines=2,
+                                    visible=True)
+
+    analyze_button.click(
+        fn=analyze_video,
+        inputs=[video_input, language_input, max_speakers],
+        outputs=[output_section, execution_info_box],
+        show_progress=True  # Enables the progress bar in Gradio
+    )
+
+# Launch the app
 if __name__ == "__main__":
+    iface.launch()
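If the Space pins a Gradio 4.x release, the gr.Textbox.update(...) calls above will fail, since component-level .update() helpers were removed in Gradio 4. A minimal sketch of the generic replacement; this is an assumption about the installed Gradio version, not part of this commit:

import gradio as gr

def completion_update(execution_time: float):
    # Gradio 4-style property update for an already-declared Textbox output
    return gr.update(value=f"Completed in {int(execution_time)} seconds.", visible=True)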
config.py
ADDED
@@ -0,0 +1,8 @@
+# config.py
+import os
+from dotenv import load_dotenv
+
+load_dotenv()  # This loads the variables from .env file
+
+openai_api_key = os.getenv('OPENAI_API_KEY')
+hf_token = os.getenv('hf_token')
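For local runs this module expects the two variables in a .env file next to the code; on a Hugging Face Space the same names can be provided as Space secrets, since os.getenv reads the environment either way. A minimal sketch of such a file, with placeholder values:

# .env -- placeholder values, do not commit real keys
OPENAI_API_KEY=sk-your-openai-key
hf_token=hf_your-huggingface-token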
llm_loader.py
ADDED
@@ -0,0 +1,12 @@
+# llm_loader.py
+from langchain.chat_models import ChatOpenAI
+
+def load_model(openai_api_key):
+    return ChatOpenAI(
+        model_name="gpt-4o",
+        openai_api_key=openai_api_key,
+        temperature=0.01,
+        max_tokens=2096,
+        top_p=0.95,
+        top_k=25
+    )
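A minimal usage sketch of the loader (the key is a placeholder). One caveat, stated as an assumption: langchain's ChatOpenAI does not define top_p or top_k as constructor fields, and the OpenAI chat API has no top_k parameter at all, so those two arguments may be shifted into model_kwargs or rejected at request time.

from langchain.schema import HumanMessage
from llm_loader import load_model

llm = load_model("sk-your-openai-key")  # placeholder key
reply = llm([HumanMessage(content="Reply with the single word: ready")])
print(reply.content)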
output_parser.py
CHANGED
@@ -1,7 +1,5 @@
-from langchain.output_parsers import StructuredOutputParser, ResponseSchema
-from langchain.prompts import PromptTemplate
 from pydantic import BaseModel
-from
+from langchain.output_parsers import PydanticOutputParser
 
 class AttachmentStyle(BaseModel):
     speaker: str
@@ -37,66 +35,6 @@ class PersonalityDisorder(BaseModel):
     obsessional: int
     explanation: str
 
-    (three removed lines unreadable in this export: the opening of the attachment response-schema list)
-    ResponseSchema(name="anxious_preoccupied", description="Probability of anxious-preoccupied attachment style (0-1)"),
-    ResponseSchema(name="dismissive_avoidant", description="Probability of dismissive-avoidant attachment style (0-1)"),
-    ResponseSchema(name="fearful_avoidant", description="Probability of fearful-avoidant attachment style (0-1)"),
-    ResponseSchema(name="self_rating", description="Self rating (0-10)"),
-    ResponseSchema(name="others_rating", description="Others rating (0-10)"),
-    ResponseSchema(name="anxiety", description="Anxiety rating (0-10)"),
-    ResponseSchema(name="avoidance", description="Avoidance rating (0-10)"),
-    ResponseSchema(name="explanation", description="Brief explanation of the attachment style")
-]
-
-bigfive_response_schemas = [
-    ResponseSchema(name="speaker", description="The name or number of the speaker"),
-    ResponseSchema(name="extraversion", description="Extraversion rating (-10 to 10)"),
-    ResponseSchema(name="agreeableness", description="Agreeableness rating (-10 to 10)"),
-    ResponseSchema(name="conscientiousness", description="Conscientiousness rating (-10 to 10)"),
-    ResponseSchema(name="neuroticism", description="Neuroticism rating (-10 to 10)"),
-    ResponseSchema(name="openness", description="Openness rating (-10 to 10)"),
-    ResponseSchema(name="explanation", description="Brief explanation of the Big Five traits")
-]
-
-personality_response_schemas = [
-    ResponseSchema(name="speaker", description="The name or number of the speaker"),
-    ResponseSchema(name="depressed", description="Depressed rating (0-4)"),
-    ResponseSchema(name="paranoid", description="Paranoid rating (0-4)"),
-    ResponseSchema(name="schizoid_schizotypal", description="Schizoid-Schizotypal rating (0-4)"),
-    ResponseSchema(name="antisocial_psychopathic", description="Antisocial-Psychopathic rating (0-4)"),
-    ResponseSchema(name="borderline_dysregulated", description="Borderline-Dysregulated rating (0-4)"),
-    ResponseSchema(name="narcissistic", description="Narcissistic rating (0-4)"),
-    ResponseSchema(name="anxious_avoidant", description="Anxious-Avoidant rating (0-4)"),
-    ResponseSchema(name="dependent_victimized", description="Dependent-Victimized rating (0-4)"),
-    ResponseSchema(name="obsessional", description="Obsessional rating (0-4)"),
-    ResponseSchema(name="explanation", description="Brief explanation of the personality disorders")
-]
-
-attachment_parser = StructuredOutputParser.from_response_schemas(attachment_response_schemas)
-bigfive_parser = StructuredOutputParser.from_response_schemas(bigfive_response_schemas)
-personality_parser = StructuredOutputParser.from_response_schemas(personality_response_schemas)
-
-def get_prompt_template(task: str, parser: StructuredOutputParser) -> PromptTemplate:
-    return PromptTemplate(
-        template="Analyze the following text according to the given task:\n\n{task}\n\n{format_instructions}\n\nText: {text}\n\nAnalysis:",
-        input_variables=["text"],
-        partial_variables={
-            "task": task,
-            "format_instructions": parser.get_format_instructions()
-        }
-    )
-
-def parse_analysis_output(output: str, analysis_type: str) -> Dict[str, BaseModel]:
-    if analysis_type == "attachments":
-        parsed = attachment_parser.parse(output)
-        return {parsed['speaker']: AttachmentStyle(**parsed)}
-    elif analysis_type == "bigfive":
-        parsed = bigfive_parser.parse(output)
-        return {parsed['speaker']: BigFiveTraits(**parsed)}
-    elif analysis_type == "personalities":
-        parsed = personality_parser.parse(output)
-        return {parsed['speaker']: PersonalityDisorder(**parsed)}
-    else:
-        raise ValueError(f"Unknown analysis type: {analysis_type}")
+attachment_parser = PydanticOutputParser(pydantic_object=AttachmentStyle)
+bigfive_parser = PydanticOutputParser(pydantic_object=BigFiveTraits)
+personality_parser = PydanticOutputParser(pydantic_object=PersonalityDisorder)
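These parsers embed JSON format instructions into the prompt and turn the model's reply back into the pydantic models defined above. A minimal round-trip sketch; the JSON values are invented, and the field list is inferred from how visualization.py reads AttachmentStyle, so the real model may require more fields:

from output_parser import attachment_parser

# attachment_parser.get_format_instructions() is what processing.py appends to the prompt
fake_reply = '''{"speaker": "1", "secured": 0.6, "anxious_preoccupied": 0.2,
  "dismissive_avoidant": 0.1, "fearful_avoidant": 0.1, "self_rating": 7,
  "others_rating": 6, "anxiety": 3, "avoidance": 2,
  "explanation": "Mostly secure, low-avoidance language."}'''

parsed = attachment_parser.parse(fake_reply)  # -> AttachmentStyle instance
print(parsed.secured, parsed.explanation)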
processing.py
CHANGED
@@ -1,231 +1,65 @@
-    (old lines 1-7 are unreadable in this export apart from two truncated "import ..." statements)
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from langdetect import detect
-from langchain.chains import RetrievalQA
-from langchain_community.llms import HuggingFacePipeline
-from langchain.prompts import PromptTemplate
-from langchain_community.document_loaders import TextLoader, PyPDFLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from transcription_diarization import process_video
-from output_parser import get_prompt_template, attachment_parser, bigfive_parser, personality_parser, parse_analysis_output
-
-hf_token = os.environ.get('hf_secret')
-if not hf_token:
-    raise ValueError("HF_TOKEN not found in environment variables. Please set it in the Space secrets.")
-
-login(token=hf_token)
-
-def load_instructions(file_path):
-    with open(file_path, 'r') as file:
+# processing.py
+from langchain.schema import HumanMessage
+from output_parser import attachment_parser, bigfive_parser, personality_parser
+
+
+def load_text(file_path: str) -> str:
+    with open(file_path, 'r', encoding='utf-8') as file:
         return file.read().strip()
 
-def load_knowledge(file_path):
-    loader = TextLoader(file_path)
-    documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    texts = text_splitter.split_documents(documents)
-    return texts
-
-    (several unreadable removed lines)
-
-def detect_language(text):
+
+def truncate_text(text: str, max_tokens: int = 10000) -> str:
+    words = text.split()
+    if len(words) > max_tokens:
+        truncated_text = ' '.join(words[:max_tokens])
+        print(f"Text truncated from {len(words)} to {max_tokens} words")
+        return truncated_text
+    print(f"Text not truncated, contains {len(words)} words")
+    return text
+
+
+def process_task(llm, input_text: str, general_task: str, specific_task: str, knowledge: str, output_parser):
+    truncated_input = truncate_text(input_text)
+
+    prompt = f"""{general_task}
+
+{specific_task}
+
+Knowledge: {knowledge}
+
+Input: {truncated_input}
 
+{output_parser.get_format_instructions()}
+
+Analysis:"""
+
+    messages = [HumanMessage(content=prompt)]
+    response = llm(messages)
+    print(response)
 
     try:
-        (unreadable removed lines: evidently the rest of detect_language and the start of the SequentialAnalyzer class and its __init__)
-        self.pipe = self.create_pipeline(self.model)
-
-    def set_seed(self, seed):
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(seed)
-
-    def load_model(self):
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            use_auth_token=self.hf_token,
-            use_cache=False,
-            load_in_4bit=False
-        )
-        return model
-
-    def create_pipeline(self, model):
-        from transformers import pipeline
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_auth_token=self.hf_token)
-        return pipeline(
-            "text-generation",
-            model=model,
-            top_k=50,
-            top_p=0.8,
-            tokenizer=tokenizer,
-            max_new_tokens=512,
-            temperature=0.3,
-            repetition_penalty=1.2,
-            do_sample=False,
-            truncation=True,
-            bad_words_ids=[[tokenizer.encode(char, add_special_tokens=False)[0]] for char in "*"]
-        )
-
-    def post_process_output(self, output):
-        return re.sub(r'[*]', '', output).strip()
-
-    def analyze_task(self, content, task, knowledge_db, analysis_type):
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_auth_token=self.hf_token)
-
-        input_tokens = len(tokenizer.encode(content))
-
-        max_input_length = 800
-        encoded_input = tokenizer.encode(content, truncation=True, max_length=max_input_length)
-        truncated_content = tokenizer.decode(encoded_input)
-
-        if len(encoded_input) == max_input_length:
-            print(f"Warning: Input was truncated from {input_tokens} to {max_input_length} tokens.")
-
-        llm = HuggingFacePipeline(pipeline=self.pipe)
-
-        if analysis_type == "attachments":
-            parser = attachment_parser
-        elif analysis_type == "bigfive":
-            parser = bigfive_parser
-        elif analysis_type == "personalities":
-            parser = personality_parser
-        else:
-            raise ValueError(f"Unknown analysis type: {analysis_type}")
-
-        prompt_template = PromptTemplate(
-            template=task + "\n\n{context}\n\n{query}\n\nSpeaker: {speaker}\n\n" + parser.get_format_instructions() + "\n\nAnalysis:",
-            input_variables=["context", "query", "speaker"]
-        )
-
-        if knowledge_db is None:
-            chain = prompt_template | llm
-            result = chain.invoke({"query": truncated_content, "speaker": "Unknown"})
-            output = result
-        else:
-            chain = RetrievalQA.from_chain_type(
-                llm=llm,
-                chain_type="stuff",
-                retriever=knowledge_db.as_retriever(),
-                chain_type_kwargs={"prompt": prompt_template}
-            )
-            result = chain({"query": truncated_content, "speaker": "Unknown"})
-            output = result['result']  # RetrievalQA returns a dict with 'result' key
-
-        print(f"Raw model output: {output}")
-
-        try:
-            cleaned_output = self.post_process_output(output)
-            parsed_output = parser.parse(cleaned_output)
-        except Exception as e:
-            raise ValueError(f"Error parsing output: {e}")
-
-        # Check if all required keys are present
-        required_keys = {schema.name for schema in parser.response_schemas}
-        missing_keys = required_keys - parsed_output.keys()
-
-        if missing_keys:
-            raise ValueError(f"Missing some input keys: {missing_keys}")
-
-        return cleaned_output, input_tokens
-
-def process_input(input_file, max_speakers, progress=None):
-    start_time = time.time()
-
-    def safe_progress(value, desc=""):
-        if progress is not None:
-            try:
-                progress(value, desc=desc)
-            except Exception as e:
-                print(f"Progress update failed: {e}")
-
-    safe_progress(0, desc="Processing file")
-
-    if isinstance(input_file, str):
-        file_path = input_file
-    else:
-        file_path = input_file.name
-
-    file_extension = os.path.splitext(file_path)[1].lower()
-
-    if file_extension in ['.txt', '.srt']:
-        with open(file_path, 'r', encoding='utf-8') as file:
-            content = file.read()
-        transcription = content
-    elif file_extension == '.pdf':
-        loader = PyPDFLoader(file_path)
-        pages = loader.load_and_split()
-        content = '\n'.join([page.page_content for page in pages])
-        transcription = content
-    elif file_extension in ['.mp4', '.avi', '.mov']:
-        safe_progress(0.2, desc="Processing video...")
-        srt_path = process_video(file_path, hf_token, "en", max_speakers)
-        with open(srt_path, 'r', encoding='utf-8') as file:
-            content = file.read()
-        transcription = content
-        os.remove(srt_path)
-    else:
-        return "Unsupported file format. Please upload a TXT, SRT, PDF, or video file.", None, None, None, None, None, None
-
-    detected_language = detect_language(content)
-
-    safe_progress(0.2, desc="Initializing analyzer")
-    analyzer = SequentialAnalyzer(hf_token)
-
+        parsed_output = output_parser.parse(response.content)
+        return parsed_output
+    except Exception as e:
+        print(f"Error parsing output: {e}")
+        return None
+
+
+def process_input(input_text: str, llm):
+    general_task = load_text("tasks/general_task.txt")
+
     tasks = [
-        (two removed task tuples, unreadable in this export)
+        ("attachments", "tasks/Attachments_task.txt", "knowledge/bartholomew_attachments_definitions.txt",
+         attachment_parser),
+        ("bigfive", "tasks/BigFive_task.txt", "knowledge/bigfive_definitions.txt", bigfive_parser),
+        ("personalities", "tasks/Personalities_task.txt", "knowledge/personalities_definitions.txt", personality_parser)
     ]
-
-    results =
-        (unreadable removed lines: evidently the loop that filled results by running the analyzer over each task)
-
-    end_time = time.time()
-    execution_time = end_time - start_time
-
-    safe_progress(1.0, desc="Analysis complete!")
-
-    parsed_results = [parse_analysis_output(result, analysis_type) for result, analysis_type in results]
-
-    return (
-        "Analysis complete!",
-        f"{execution_time:.2f} seconds",
-        detected_language,
-        parsed_results[0],  # attachments
-        parsed_results[1],  # bigfive
-        parsed_results[2],  # personalities,
-        transcription
-    )
+
+    results = {}
+
+    for task_name, task_file, knowledge_file, parser in tasks:
+        specific_task = load_text(task_file)
+        knowledge = load_text(knowledge_file)
+        results[task_name] = process_task(llm, input_text, general_task, specific_task, knowledge, parser)
+
+    return results
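A minimal usage sketch of the new flow, assuming the tasks/ and knowledge/ text files referenced above exist next to the code (the transcript string and key are placeholders):

from llm_loader import load_model
from processing import process_input

llm = load_model("sk-your-openai-key")  # placeholder key
transcript = "1\n00:00:01 --> 00:00:04\nSpeaker 1: I feel calmer once we talk things through.\n"
results = process_input(transcript, llm)

# results is keyed by task name; each value is a parsed pydantic object, or None if parsing failed
print(results["attachments"])
print(results["bigfive"])
print(results["personalities"])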
requirements.txt
CHANGED
@@ -6,14 +6,15 @@ langchain
 langchain-community
 faiss-gpu
 bitsandbytes
-seaborn
 plotly
 sentence-transformers
-huggingface_hub
 moviepy
 pyannote.audio
 librosa
-(one removed line unreadable in this export)
-opencv-python
+soundfile
 numpy
-accelerate
+accelerate
+
+
+
+
transcription_diarization.py
CHANGED
@@ -6,54 +6,54 @@ from moviepy.editor import VideoFileClip
 from pyannote.audio import Pipeline
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import librosa
+import soundfile as sf
 import datetime
 from collections import defaultdict
 import numpy as np
-import spaces
 
 class LazyDiarizationPipeline:
     def __init__(self):
         self.pipeline = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
-    def get_pipeline(self, diarization_access_token):
+    def get_pipeline(self, hf_token):
         if self.pipeline is None:
-            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
-                (one removed line unreadable in this export)
+            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
+                                                     use_auth_token=hf_token)
+            self.pipeline = self.pipeline.to(self.device)
             torch.cuda.empty_cache()
             gc.collect()
         return self.pipeline
 
+
 class LazyTranscriptionPipeline:
     def __init__(self):
         self.model = None
         self.processor = None
         self.pipe = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
-    def get_pipeline(self, language):
+    def get_pipeline(self):
         if self.pipe is None:
             model_id = "openai/whisper-large-v3"
+            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
             self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
             )
-            self.model.to(
+            self.model.to(self.device)
             self.processor = AutoProcessor.from_pretrained(model_id)
             self.pipe = pipeline(
                 "automatic-speech-recognition",
                 model=self.model,
                 tokenizer=self.processor.tokenizer,
                 feature_extractor=self.processor.feature_extractor,
-                max_new_tokens=128,
                 chunk_length_s=30,
-                batch_size=8,
                 return_timestamps=True,
-
-                device=torch.device("cuda"),
-                generate_kwargs={"language": language}
+                device=self.device
             )
         return self.pipe
 
+
 lazy_diarization_pipeline = LazyDiarizationPipeline()
 lazy_transcription_pipeline = LazyTranscriptionPipeline()
 
@@ -62,12 +62,13 @@ def extract_audio(video_path, audio_path):
     audio = video.audio
     audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
 
+
 def format_timestamp(seconds):
     return str(datetime.timedelta(seconds=seconds)).split('.')[0]
 
-
-def transcribe_audio(audio_path, language, progress=None):
-    pipe = lazy_transcription_pipeline.get_pipeline(
+
+def transcribe_audio(audio_path, language):
+    pipe = lazy_transcription_pipeline.get_pipeline()
 
     audio, sr = librosa.load(audio_path, sr=16000)
     duration = len(audio) / sr
@@ -81,22 +82,44 @@ def transcribe_audio(audio_path, language, progress=None):
         audio_chunk = audio[start:end]
         audio_chunk = (audio_chunk * 32767).astype(np.float32)
 
-        result = pipe(audio_chunk)
+        result = pipe(audio_chunk, generate_kwargs={"language": language, "task": "transcribe"})
+
         transcription_txt += result["text"]
         for chunk in result["chunks"]:
             start_time, end_time = chunk["timestamp"]
+            if start_time is None:
+                start_time = 0
+            if end_time is None:
+                end_time = 0
             transcription_chunks.append({
                 "start": start_time + i * 30,
                 "end": end_time + i * 30,
                 "text": chunk["text"]
            })
 
-        if progress:
-            progress(0.6 + 0.2 * (i + 1) / n_chunks, desc=f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
-
     return transcription_txt, transcription_chunks
 
-(one removed line unreadable in this export)
+
+def diarize_audio(audio_path, pipeline, max_speakers):
+    # Load the entire audio file
+    audio, sr = librosa.load(audio_path, sr=16000)
+
+    # Write the audio to a temporary file if needed for the pipeline
+    temp_audio_path = f"{audio_path}_temp.wav"
+    sf.write(temp_audio_path, audio, sr)
+
+    # Perform speaker diarization on the entire audio file
+    diarization = pipeline(temp_audio_path, num_speakers=max_speakers)
+
+    # Clean up the temporary file
+    os.remove(temp_audio_path)
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    return diarization
+
+
+def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
     speaker_segments = []
     speaker_durations = defaultdict(float)
 
@@ -105,7 +128,7 @@ def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
         speaker_segments.append((segment.start, segment.end, speaker))
 
     sorted_speakers = sorted(speaker_durations.items(), key=lambda x: x[1], reverse=True)[:max_speakers]
-
+
     speaker_map = {}
     for i, (speaker, _) in enumerate(sorted_speakers, start=1):
         speaker_map[speaker] = f"Speaker {i}"
@@ -132,28 +155,20 @@ def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
         duration_str = format_timestamp(duration).split('.')[0].lstrip('0')
         srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
 
-
-def process_video(video_path, diarization_access_token, language, max_speakers=3):
+
+def process_video(video_path, hf_token, language, max_speakers=3):
     base_name = os.path.splitext(video_path)[0]
     audio_path = f"{base_name}.wav"
     extract_audio(video_path, audio_path)
 
-
-
-    pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
-    diarization = pipeline(audio_path)
-    if progress:
-        progress(0.5, desc="Diarization complete.")
+    pipeline = lazy_diarization_pipeline.get_pipeline(hf_token)
+    diarization = diarize_audio(audio_path, pipeline, max_speakers)
 
     # Clear GPU memory after diarization
     torch.cuda.empty_cache()
     gc.collect()
 
-
-    progress(0.6, desc="Performing transcription...")
-    transcription, chunks = transcribe_audio(audio_path, language, progress)
-    if progress:
-        progress(0.8, desc="Transcription complete.")
+    transcription, chunks = transcribe_audio(audio_path, language)
 
     # Clear GPU memory after transcription
     torch.cuda.empty_cache()
@@ -161,8 +176,6 @@ def process_video(video_path, diarization_access_token, language, max_speakers=3):
 
     combined_srt_path = f"{base_name}_combined.srt"
     create_combined_srt(chunks, diarization, combined_srt_path, max_speakers)
-    if progress:
-        progress(0.9, desc="Combined SRT file created.")
 
     os.remove(audio_path)
 
@@ -170,7 +183,4 @@ def process_video(video_path, diarization_access_token, language, max_speakers=3):
     torch.cuda.empty_cache()
     gc.collect()
 
-
-    progress(1.0, desc="Video processing complete.")
-
-    return combined_srt_path
+    return combined_srt_path
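A minimal standalone sketch of this module, assuming a local video file and a Hugging Face token that has accepted the pyannote/speaker-diarization-3.1 terms (file name and token are placeholders):

from transcription_diarization import process_video

srt_path = process_video("sample_interview.mp4",   # placeholder local file
                         hf_token="hf_your-token",  # placeholder token
                         language="en",
                         max_speakers=2)

with open(srt_path, encoding="utf-8") as f:
    print(f.read()[:500])  # speaker-labelled transcript from the combined SRT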
visualization.py
CHANGED
@@ -1,99 +1,70 @@
-import plotly.
-    (unreadable removed lines)
-def
-    (unreadable removed lines: evidently a bar-chart helper)
-        x=list(data.keys()),
-        y=list(data.values()),
-        marker_color=colors[:len(data)]
-    )])
-    fig.update_layout(title=title)
-    return fig
-
-    (unreadable removed lines: evidently a radar-chart helper)
-    fig = go.Figure(data=go.Scatterpolar(
-        r=ordered_values,
-        theta=ordered_keys,
-        fill='toself'
-    ))
-    fig.update_layout(
-        polar=dict(radialaxis=dict(visible=True, range=[0, max(ordered_values, default=1)])),
-        showlegend=False,
-        title=title
-    )
-    return fig
-
-    (many unreadable removed lines)
-    # Hide unused speaker components
-    for _ in range(3 - len(sorted_speakers)):
-        outputs.extend([gr.update(visible=False)] * 7)  # 7 components per speaker
-
-    print("Debug: Attachments Data:", attachments_data)
-    print("Debug: Big Five Data:", bigfive_data)
-    print("Debug: Personalities Data:", personalities_data)
-    print("Debug: Chart Data:", chart_data)
-    print("Debug: Sorted Speakers:", sorted_speakers)
-
-    return outputs
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 
+def create_charts(results):
+    charts = {}
+    explanations = {}
 
+    # Loop through each detected speaker in the results
+    for speaker_id, speaker_data in results['speakers'].items():
+        speaker_charts = {}
+        speaker_explanations = {}
 
+        # Attachment Styles for each speaker
+        attachment_data = speaker_data['attachments']
+        fig_attachment = go.Figure(go.Bar(
+            x=['Secured', 'Anxious-Preoccupied', 'Dismissive-Avoidant', 'Fearful-Avoidant'],
+            y=[attachment_data.secured, attachment_data.anxious_preoccupied,
+               attachment_data.dismissive_avoidant, attachment_data.fearful_avoidant],
+            marker_color=['blue', 'orange', 'green', 'red']
+        ))
+        fig_attachment.update_layout(title_text=f"Attachment Styles - Speaker {speaker_id}", showlegend=False)
+        speaker_charts["attachment"] = fig_attachment
+        speaker_explanations["attachment"] = attachment_data.explanation
+
+        # Attachment Dimensions (Radar Chart) for each speaker
+        fig_dimensions = go.Figure(go.Scatterpolar(
+            r=[attachment_data.avoidance, attachment_data.anxiety, attachment_data.self_rating, attachment_data.others_rating],
+            theta=['Avoidance', 'Anxiety', 'Self', 'Others'],
+            fill='toself'
+        ))
+        fig_dimensions.update_layout(title_text=f"Attachment Dimensions - Speaker {speaker_id}", showlegend=False)
+        speaker_charts["dimensions"] = fig_dimensions
+
+        # Big Five Traits for each speaker
+        bigfive_data = speaker_data['bigfive']
+        fig_bigfive = go.Figure(go.Bar(
+            x=['Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness'],
+            y=[bigfive_data.extraversion, bigfive_data.agreeableness,
+               bigfive_data.conscientiousness, bigfive_data.neuroticism, bigfive_data.openness],
+            marker_color=['blue', 'green', 'red', 'purple', 'orange']
+        ))
+        fig_bigfive.update_layout(title_text=f"Big Five Traits - Speaker {speaker_id}", showlegend=False)
+        speaker_charts["bigfive"] = fig_bigfive
+        speaker_explanations["bigfive"] = bigfive_data.explanation
+
+        # Personality Disorders for each speaker
+        personality_data = speaker_data['personalities']
+        fig_personality = go.Figure(go.Bar(
+            x=['Antisocial', 'Narcissistic', 'Depressed', 'Anxious-Avoidant',
+               'Obsessive', 'Paranoid', 'Borderline', 'Dependent', 'Schizoid-Schizotypal'],
+            y=[personality_data.antisocial_psychopathic, personality_data.narcissistic,
+               personality_data.depressed, personality_data.anxious_avoidant,
+               personality_data.obsessional, personality_data.paranoid,
+               personality_data.borderline_dysregulated, personality_data.dependent_victimized,
+               personality_data.schizoid_schizotypal],
+            marker_color=['black', 'orange', 'gray', 'green', 'brown', 'purple', 'red', 'cyan', 'magenta']
+        ))
+        fig_personality.update_layout(title_text=f"Personality Disorders - Speaker {speaker_id}", showlegend=False)
+        speaker_charts["personality"] = fig_personality
+        speaker_explanations["personality"] = personality_data.explanation
+
+        # Update all charts to take full width
+        for fig in speaker_charts.values():
+            fig.update_layout(height=400, width=None, margin=dict(l=50, r=50, t=100, b=50))
+
+        # Store the charts and explanations for each speaker
+        charts[speaker_id] = speaker_charts
+        explanations[speaker_id] = speaker_explanations
+
+    return charts, explanations
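Note that create_charts reads results['speakers'][speaker_id]['attachments'] and friends, while process_input above returns a flat {"attachments": ..., "bigfive": ..., "personalities": ...} dict, so some adapter in between is evidently assumed. A sketch of one possible shape for the single-speaker case, written as an assumption rather than as the committed glue code:

def group_by_speaker(flat_results):
    # flat_results: output of process_input; values are parsed pydantic objects (or None on failure)
    speaker_id = flat_results["attachments"].speaker
    return {"speakers": {speaker_id: flat_results}}

With that adapter, create_charts(group_by_speaker(results)) yields one set of charts and explanations per speaker id.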