Spaces:

reab5555
/

Multiple-Speakers-Personality-Analyzer

Runtime error

App Files Community

reab5555 committed on Aug 5, 2024

Commit

5e7b13d

verified ·

1 Parent(s): ec6ecc2

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -311

app.py CHANGED Viewed

@@ -1,301 +1,6 @@
-import os
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
-from langchain_community.llms import HuggingFacePipeline
-from langchain_community.document_loaders import TextLoader, PyPDFLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain.prompts import PromptTemplate
-from langchain.chains import RetrievalQA
-from huggingface_hub import login
-import diarization
-import shutil
-import spaces
-import time
-from langdetect import detect
-import plotly.graph_objs as go
-import re
-from collections import Counter
-# Set environment variable to disable tokenizers parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Get Hugging Face token from Space secret
-hf_token = os.environ.get('hf_secret')
-if not hf_token:
-    raise ValueError("HF_TOKEN not found in environment variables. Please set it in the Space secrets.")
-# Login to Hugging Face
-login(token=hf_token)
-# Language detection function
-def detect_language(text):
-    try:
-        return detect(text)
-    except:
-        return "en"  # default to English if detection fails
-# Lazy initialization for the pipeline
-class LazyPipeline:
-    def __init__(self):
-        self.pipeline = None
-    @spaces.GPU(duration=250)
-    def get_pipeline(self):
-        if self.pipeline is None:
-            import torch
-            model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map="auto",
-            )
-            self.pipeline = pipeline(
-                "text-generation",
-                model=model,
-                tokenizer=tokenizer,
-                max_new_tokens=4096,
-                temperature=0.8,
-            )
-        return self.pipeline
-lazy_pipe = LazyPipeline()
-# Create a LangChain wrapper around the pipeline
-class LazyLLM:
-    def __init__(self, lazy_pipeline):
-        self.lazy_pipeline = lazy_pipeline
-        self.llm = None
-    @spaces.GPU(duration=150)
-    def get_llm(self):
-        if self.llm is None:
-            pipe = self.lazy_pipeline.get_pipeline()
-            self.llm = HuggingFacePipeline(pipeline=pipe)
-        return self.llm
-lazy_llm = LazyLLM(lazy_pipe)
-# Load instruction files
-def load_instructions(file_path):
-    with open(file_path, 'r') as file:
-        return file.read().strip()
-attachments_task = load_instructions("tasks/Attachments_task.txt")
-bigfive_task = load_instructions("tasks/BigFive_task.txt")
-personalities_task = load_instructions("tasks/Personalities_task.txt")
-# Load knowledge files
-def load_knowledge(file_path):
-    loader = TextLoader(file_path)
-    documents = loader.load()
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    texts = text_splitter.split_documents(documents)
-    return texts
-attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_definitions - no int.txt")
-bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
-personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
-# Create vector stores
-embeddings = HuggingFaceEmbeddings()
-attachments_db = FAISS.from_documents(attachments_knowledge, embeddings)
-bigfive_db = FAISS.from_documents(bigfive_knowledge, embeddings)
-personalities_db = FAISS.from_documents(personalities_knowledge, embeddings)
-# Lazy initialization for retrieval chains
-class LazyChains:
-    def __init__(self, lazy_llm):
-        self.lazy_llm = lazy_llm
-        self.attachments_chain = None
-        self.bigfive_chain = None
-        self.personalities_chain = None
-    def create_prompt(self, task):
-        return PromptTemplate(
-            template=task + "\n\nContext: {context}\n\nTask: {question}\n\n-----------\n\nAnswer: ",
-            input_variables=["context", "question"]
-        )
-    @spaces.GPU(duration=200)
-    def get_chains(self):
-        if self.attachments_chain is None:
-            llm = self.lazy_llm.get_llm()
-            self.attachments_chain = RetrievalQA.from_chain_type(
-                llm=llm,
-                chain_type="stuff",
-                retriever=attachments_db.as_retriever(),
-                chain_type_kwargs={"prompt": self.create_prompt(attachments_task)}
-            )
-            self.bigfive_chain = RetrievalQA.from_chain_type(
-                llm=llm,
-                chain_type="stuff",
-                retriever=bigfive_db.as_retriever(),
-                chain_type_kwargs={"prompt": self.create_prompt(bigfive_task)}
-            )
-            self.personalities_chain = RetrievalQA.from_chain_type(
-                llm=llm,
-                chain_type="stuff",
-                retriever=personalities_db.as_retriever(),
-                chain_type_kwargs={"prompt": self.create_prompt(personalities_task)}
-            )
-        return self.attachments_chain, self.bigfive_chain, self.personalities_chain
-lazy_chains = LazyChains(lazy_llm)
-def count_words_and_tokens(text):
-    words = len(text.split())
-    tokens = len(AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3").tokenize(text))
-    return words, tokens
-@spaces.GPU(duration=150)
-def process_input(input_file, progress=gr.Progress()):
-    start_time = time.time()
-    progress(0, desc="Processing file...")
-    file_extension = os.path.splitext(input_file.name)[1].lower()
-    if file_extension == '.txt':
-        with open(input_file.name, 'r', encoding='utf-8') as file:
-            content = file.read()
-        words, tokens = count_words_and_tokens(content)
-        input_info = f"Text file processed. Words: {words}, Tokens: {tokens}"
-    elif file_extension == '.pdf':
-        loader = PyPDFLoader(input_file.name)
-        pages = loader.load_and_split()
-        content = '\n'.join([page.page_content for page in pages])
-        words, tokens = count_words_and_tokens(content)
-        input_info = f"PDF file processed. Words: {words}, Tokens: {tokens}"
-    elif file_extension in ['.mp4', '.avi', '.mov']:
-        temp_video_path = "temp_video" + file_extension
-        shutil.copy2(input_file.name, temp_video_path)
-        progress(0.2, desc="Transcribing video...")
-        language = "en"  # Default to English for video files
-        diarization.process_video(temp_video_path, hf_token, language)
-        srt_path = temp_video_path.replace(file_extension, "_combined.srt")
-        with open(srt_path, 'r', encoding='utf-8') as file:
-            content = file.read()
-        words, tokens = count_words_and_tokens(content)
-        input_info = f"Input Words: {words} / Input Tokens: {tokens}"
-    else:
-        return "Unsupported file format. Please upload a TXT, PDF, or video file.", None, None, None, None, None, None
-    detected_language = detect_language(content)
-    progress(0.4, desc="Analyzing content...")
-    attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
-    progress(0.6, desc="Analyzing attachments...")
-    attachments_result = attachments_chain({"query": content})
-    attachments_answer = attachments_result['result'].split("-----------\n\nAnswer:")[-1].strip()
-    progress(0.7, desc="Analyzing Big Five traits...")
-    bigfive_result = bigfive_chain({"query": content})
-    bigfive_answer = bigfive_result['result'].split("-----------\n\nAnswer:")[-1].strip()
-    progress(0.8, desc="Analyzing personalities...")
-    personalities_result = personalities_chain({"query": content})
-    personalities_answer = personalities_result['result'].split("-----------\n\nAnswer:")[-1].strip()
-    end_time = time.time()
-    execution_time = end_time - start_time
-    execution_info = f"{execution_time:.2f} seconds"
-    progress(1.0, desc="Analysis complete!")
-    print("Attachments answer:", attachments_answer)
-    print("Big Five answer:", bigfive_answer)
-    print("Personalities answer:", personalities_answer)
-    return ("Analysis complete!", execution_info, detected_language, input_info,
-            attachments_answer, bigfive_answer, personalities_answer)
-def extract_speaker_data(text):
-    speakers = {}
-    current_speaker = None
-    for line in text.split('\n'):
-        if line.lower().startswith("speaker"):
-            current_speaker = line.split(":")[1].strip() if ":" in line else line.split()[1]
-            speakers[current_speaker] = {}
-        elif current_speaker and ":" in line:
-            key, value = line.split(":", 1)
-            try:
-                speakers[current_speaker][key.strip()] = float(value.strip())
-            except ValueError:
-                # If conversion to float fails, try to extract a number from the string
-                match = re.search(r"[-+]?\d*\.\d+|\d+", value)
-                if match:
-                    speakers[current_speaker][key.strip()] = float(match.group())
-    return speakers
-def create_bar_chart(data, title, speaker):
-    fig = go.Figure(data=[go.Bar(
-        x=list(data.keys()),
-        y=list(data.values()),
-        marker_color=['red', 'green', 'blue', 'yellow', 'purple', 'orange', 'pink', 'cyan', 'magenta', 'brown'][:len(data)]
-    )])
-    fig.update_layout(title=f"{title} - Speaker {speaker}", xaxis_title="Traits", yaxis_title="Score")
-    return fig
-def update_visibility_and_charts(status, exec_time, lang, info, attachments, bigfive, personalities):
-    print("Attachments output:", attachments)
-    print("Big Five output:", bigfive)
-    print("Personalities output:", personalities)
-    charts = []
-    if not any([attachments, bigfive, personalities]):
-        print("No data available for chart creation.")
-        return [
-            gr.update(value="No data available for analysis. Please try again with a different input.", visible=True),
-            gr.update(value=exec_time, visible=True),
-            gr.update(value=lang, visible=True),
-            gr.update(value=info, visible=True),
-        ] + []  # No charts to return
-    for analysis_text in [attachments, bigfive, personalities]:
-        speakers_data = extract_speaker_data(analysis_text)
-        if not speakers_data:
-            print(f"No speaker data extracted from: {analysis_text}")
-        # Determine the two main speakers
-        speaker_counts = Counter(speakers_data.keys())
-        main_speakers = [speaker for speaker, count in speaker_counts.most_common(2)]
-        for speaker in main_speakers:
-            data = speakers_data.get(speaker, {})
-            attachment_data = {k: v for k, v in data.items() if k in ["Secured", "Anxious-Preoccupied", "Dismissive-Avoidant", "Fearful-Avoidant"]}
-            if attachment_data:
-                charts.append(create_bar_chart(attachment_data, "Attachment Styles", speaker))
-            bigfive_data = {k: v for k, v in data.items() if k in ["Extraversion", "Agreeableness", "Conscientiousness", "Neuroticism", "Openness"]}
-            if bigfive_data:
-                charts.append(create_bar_chart(bigfive_data, "Big Five Traits", speaker))
-            personality_data = {k: v for k, v in data.items() if k in ["Depressed", "Paranoid", "Schizoid-Schizotypal", "Antisocial-Psychopathic", "Borderline-Dysregulated", "Hysteric-Histrionic", "Narcissistic", "Anxious-Avoidant", "Dependent-Victimized", "Obsessional"]}
-            if personality_data:
-                charts.append(create_bar_chart(personality_data, "Personality Traits", speaker))
-            self_others_data = {k: v for k, v in data.items() if k in ["Self", "Others", "Anxiety", "Avoidance"]}
-            if self_others_data:
-                charts.append(create_bar_chart(self_others_data, "Self-Others and Anxiety-Avoidance", speaker))
-    print("Number of charts created:", len(charts))
-    return [
-        gr.update(value=status, visible=True),
-        gr.update(value=exec_time, visible=True),
-        gr.update(value=lang, visible=True),
-        gr.update(value=info, visible=True),
-    ] + charts
 def create_interface():
     with gr.Blocks() as iface:
@@ -312,25 +17,16 @@ def create_interface():
             detected_language = gr.Textbox(label="Detected Language", visible=False)
             input_info = gr.Textbox(label="Input Information", visible=False)
-            # Hidden textboxes for storing model outputs
             attachments_output = gr.Textbox(visible=False)
             bigfive_output = gr.Textbox(visible=False)
             personalities_output = gr.Textbox(visible=False)
-            # Container for dynamically created charts
             chart_container = gr.Column()
         def process_and_update(input_file):
-            # First, process the input
             results = process_input(input_file)
-            # Then, create and update charts
             chart_outputs = update_visibility_and_charts(*results)
-            # Create new chart components based on the number of charts
-            new_charts = [gr.Plot(visible=True) for _ in range(len(chart_outputs) - 4)]  # -4 for the non-chart outputs
-            # Update the chart container
             return chart_outputs[:4] + [gr.Column(new_charts)]
         input_file.upload(
@@ -340,8 +36,6 @@ def create_interface():
         )
     return iface
-iface = create_interface()
-# Launch the app
-iface.launch()

 import gradio as gr
+from file_processing import process_input
+from chart_creation import update_visibility_and_charts
 def create_interface():
     with gr.Blocks() as iface:
             detected_language = gr.Textbox(label="Detected Language", visible=False)
             input_info = gr.Textbox(label="Input Information", visible=False)
             attachments_output = gr.Textbox(visible=False)
             bigfive_output = gr.Textbox(visible=False)
             personalities_output = gr.Textbox(visible=False)
             chart_container = gr.Column()
         def process_and_update(input_file):
             results = process_input(input_file)
             chart_outputs = update_visibility_and_charts(*results)
+            new_charts = [gr.Plot(visible=True) for _ in range(len(chart_outputs) - 4)]
             return chart_outputs[:4] + [gr.Column(new_charts)]
         input_file.upload(
         )
     return iface
+iface = create_interface()
+iface.launch()