Spaces:

reab5555
/

Multiple-Speakers-Personality-Analyzer

Runtime error

App Files Files Community

reab5555 committed on Aug 4, 2024

Commit

ddf0a26

verified ·

1 Parent(s): 2bb0128

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -0

app.py CHANGED Viewed

@@ -1,3 +1,156 @@
 @spaces.GPU(duration=150)
 def process_input(input_file):

+import os
+import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from langchain_community.llms import HuggingFacePipeline
+from langchain_community.document_loaders import TextLoader, PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.prompts import PromptTemplate
+from langchain.chains import RetrievalQA
+from huggingface_hub import login
+import diarization
+import shutil
+import spaces
+import time
+from langdetect import detect
+# Set environment variable to disable tokenizers parallelism warning
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Get Hugging Face token from the Space secret named "hf_secret"
+hf_token = os.environ.get('hf_secret')
+if not hf_token:
+    # Name the variable that is actually read so the message is actionable.
+    raise ValueError("hf_secret not found in environment variables. Please set it in the Space secrets.")
+# Login to Hugging Face
+login(token=hf_token)
+# Language detection function
+def detect_language(text):
+    """Return the language code reported by ``detect`` for *text*.
+
+    Falls back to ``"en"`` when detection raises an ordinary exception.
+    """
+    try:
+        return detect(text)
+    except Exception:
+        # Best-effort fallback; unlike a bare ``except`` this lets
+        # KeyboardInterrupt and SystemExit propagate.
+        return "en"  # default to English if detection fails
+# Lazy initialization for the pipeline
+class LazyPipeline:
+    """Holder that builds the text-generation pipeline the first time it is requested."""
+    def __init__(self):
+        # Stays None until get_pipeline() has built the pipeline once.
+        self.pipeline = None
+    @spaces.GPU(duration=250)
+    def get_pipeline(self):
+        """Return the stored pipeline, constructing and storing it on the first call."""
+        if self.pipeline is not None:
+            return self.pipeline
+        # torch is imported here rather than at the top of the file.
+        import torch
+        checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"
+        tok = AutoTokenizer.from_pretrained(checkpoint)
+        causal_lm = AutoModelForCausalLM.from_pretrained(
+            checkpoint,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+        generation_kwargs = {
+            "max_length": 4000,
+            "max_new_tokens": 512,
+            "temperature": 0.1,
+        }
+        self.pipeline = pipeline(
+            "text-generation",
+            model=causal_lm,
+            tokenizer=tok,
+            **generation_kwargs,
+        )
+        return self.pipeline
+lazy_pipe = LazyPipeline()
+# Create a LangChain wrapper around the pipeline
+class LazyLLM:
+    """Wraps a lazy pipeline holder in a HuggingFacePipeline LLM, created on first use."""
+    def __init__(self, lazy_pipeline):
+        self.lazy_pipeline = lazy_pipeline
+        # Filled in by the first get_llm() call.
+        self.llm = None
+    @spaces.GPU(duration=150)
+    def get_llm(self):
+        """Return the stored LLM wrapper, building it from the lazy pipeline if absent."""
+        if self.llm is not None:
+            return self.llm
+        self.llm = HuggingFacePipeline(pipeline=self.lazy_pipeline.get_pipeline())
+        return self.llm
+lazy_llm = LazyLLM(lazy_pipe)
+# Load instruction files
+def load_instructions(file_path):
+    """Read the text file at *file_path* and return its contents stripped of
+    leading and trailing whitespace.
+
+    The file is decoded as UTF-8 explicitly so the result does not depend on
+    the locale's default encoding.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        return file.read().strip()
+# Read the three task instruction files, in the same order as before.
+attachments_task, bigfive_task, personalities_task = map(
+    load_instructions,
+    (
+        "tasks/Attachments_task.txt",
+        "tasks/BigFive_task.txt",
+        "tasks/Personalities_task.txt",
+    ),
+)
+# Load knowledge files
+def load_knowledge(file_path):
+    """Load the text file at *file_path* and return it split into documents.
+
+    Splitting uses CharacterTextSplitter with chunk_size=1000 and no overlap.
+    """
+    raw_documents = TextLoader(file_path).load()
+    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    return splitter.split_documents(raw_documents)
+# Load and split the three knowledge files, in the same order as before.
+attachments_knowledge, bigfive_knowledge, personalities_knowledge = map(
+    load_knowledge,
+    (
+        "knowledge/bartholomew_attachments_definitions - no int.txt",
+        "knowledge/bigfive_definitions.txt",
+        "knowledge/personalities_definitions.txt",
+    ),
+)
+# Create vector stores
+# One embeddings object is shared by all three FAISS indexes.
+embeddings = HuggingFaceEmbeddings()
+attachments_db, bigfive_db, personalities_db = (
+    FAISS.from_documents(chunks, embeddings)
+    for chunks in (attachments_knowledge, bigfive_knowledge, personalities_knowledge)
+)
+# Lazy initialization for retrieval chains
+class LazyChains:
+    """Builds the three RetrievalQA chains on first request and reuses them afterwards."""
+    def __init__(self, lazy_llm):
+        self.lazy_llm = lazy_llm
+        # All three slots are filled together by get_chains().
+        self.attachments_chain = self.bigfive_chain = self.personalities_chain = None
+    def create_prompt(self, task):
+        """Return a PromptTemplate made of *task* followed by {context} and {question} slots."""
+        suffix = "\n\nContext: {context}\n\nTask: {question}\n\n-----------\n\nAnswer: "
+        return PromptTemplate(template=task + suffix, input_variables=["context", "question"])
+    def _make_chain(self, llm, store, task):
+        """Build one "stuff" RetrievalQA chain that retrieves from *store* and is prompted with *task*."""
+        return RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=store.as_retriever(),
+            chain_type_kwargs={"prompt": self.create_prompt(task)},
+        )
+    @spaces.GPU(duration=200)
+    def get_chains(self):
+        """Return the (attachments, bigfive, personalities) chains, creating them if not yet built."""
+        # The attachments slot doubles as the "already built" flag for all three.
+        if self.attachments_chain is None:
+            shared_llm = self.lazy_llm.get_llm()
+            self.attachments_chain = self._make_chain(shared_llm, attachments_db, attachments_task)
+            self.bigfive_chain = self._make_chain(shared_llm, bigfive_db, bigfive_task)
+            self.personalities_chain = self._make_chain(shared_llm, personalities_db, personalities_task)
+        return self.attachments_chain, self.bigfive_chain, self.personalities_chain
+lazy_chains = LazyChains(lazy_llm)
+@spaces.GPU(duration=150)
+def count_words_and_tokens(text):
+    """Return a (words, tokens) pair for *text*.
+
+    Words are the whitespace-separated pieces of *text*; tokens are counted
+    with the Mistral-7B-Instruct-v0.3 tokenizer, which is loaded on each call.
+    """
+    word_total = len(text.split())
+    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+    token_total = len(tokenizer.tokenize(text))
+    return word_total, token_total
 @spaces.GPU(duration=150)
 def process_input(input_file):