reab5555 committed
Commit ddf0a26 · verified · 1 parent: 2bb0128

Update app.py

Files changed (1): app.py (+153, -0)
app.py CHANGED
@@ -1,3 +1,156 @@
+ import os
+ import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ from langchain_community.llms import HuggingFacePipeline
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import RetrievalQA
+ from huggingface_hub import login
+ import diarization
+ import shutil
+ import spaces
+ import time
+ from langdetect import detect
+
+ # Set environment variable to disable the tokenizers parallelism warning
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ # Get the Hugging Face token from the Space secret
+ hf_token = os.environ.get('hf_secret')
+ if not hf_token:
+     raise ValueError("'hf_secret' not found in environment variables. Please set it in the Space secrets.")
+
+ # Log in to Hugging Face
+ login(token=hf_token)
+
+ # Language detection helper
+ def detect_language(text):
+     try:
+         return detect(text)
+     except Exception:
+         return "en"  # default to English if detection fails
+
+ # Lazy initialization for the text-generation pipeline
+ class LazyPipeline:
+     def __init__(self):
+         self.pipeline = None
+
+     @spaces.GPU(duration=250)
+     def get_pipeline(self):
+         if self.pipeline is None:
+             import torch
+             model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+             tokenizer = AutoTokenizer.from_pretrained(model_name)
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16,
+                 device_map="auto",
+             )
+             self.pipeline = pipeline(
+                 "text-generation",
+                 model=model,
+                 tokenizer=tokenizer,
+                 max_new_tokens=512,  # max_length conflicts with max_new_tokens, so only the latter is set
+                 temperature=0.1,
+             )
+         return self.pipeline
+
+ lazy_pipe = LazyPipeline()
+
+ # LangChain wrapper around the lazily created pipeline
+ class LazyLLM:
+     def __init__(self, lazy_pipeline):
+         self.lazy_pipeline = lazy_pipeline
+         self.llm = None
+
+     @spaces.GPU(duration=150)
+     def get_llm(self):
+         if self.llm is None:
+             pipe = self.lazy_pipeline.get_pipeline()
+             self.llm = HuggingFacePipeline(pipeline=pipe)
+         return self.llm
+
+ lazy_llm = LazyLLM(lazy_pipe)
+
+ # Load instruction files
+ def load_instructions(file_path):
+     with open(file_path, 'r') as file:
+         return file.read().strip()
+
+ attachments_task = load_instructions("tasks/Attachments_task.txt")
+ bigfive_task = load_instructions("tasks/BigFive_task.txt")
+ personalities_task = load_instructions("tasks/Personalities_task.txt")
+
+ # Load knowledge files and split them into chunks
+ def load_knowledge(file_path):
+     loader = TextLoader(file_path)
+     documents = loader.load()
+     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     texts = text_splitter.split_documents(documents)
+     return texts
+
+ attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_definitions - no int.txt")
+ bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
+ personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
+
+ # Create one FAISS vector store per knowledge base
+ embeddings = HuggingFaceEmbeddings()
+ attachments_db = FAISS.from_documents(attachments_knowledge, embeddings)
+ bigfive_db = FAISS.from_documents(bigfive_knowledge, embeddings)
+ personalities_db = FAISS.from_documents(personalities_knowledge, embeddings)
+
+ # Lazy initialization for the three RetrievalQA chains
+ class LazyChains:
+     def __init__(self, lazy_llm):
+         self.lazy_llm = lazy_llm
+         self.attachments_chain = None
+         self.bigfive_chain = None
+         self.personalities_chain = None
+
+     def create_prompt(self, task):
+         return PromptTemplate(
+             template=task + "\n\nContext: {context}\n\nTask: {question}\n\n-----------\n\nAnswer: ",
+             input_variables=["context", "question"]
+         )
+
+     @spaces.GPU(duration=200)
+     def get_chains(self):
+         if self.attachments_chain is None:
+             llm = self.lazy_llm.get_llm()
+             self.attachments_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=attachments_db.as_retriever(),
+                 chain_type_kwargs={"prompt": self.create_prompt(attachments_task)}
+             )
+             self.bigfive_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=bigfive_db.as_retriever(),
+                 chain_type_kwargs={"prompt": self.create_prompt(bigfive_task)}
+             )
+             self.personalities_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=personalities_db.as_retriever(),
+                 chain_type_kwargs={"prompt": self.create_prompt(personalities_task)}
+             )
+         return self.attachments_chain, self.bigfive_chain, self.personalities_chain
+
+ lazy_chains = LazyChains(lazy_llm)
+
+ # Tokenizer for word/token counting, loaded once instead of on every call
+ count_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+
+ @spaces.GPU(duration=150)
+ def count_words_and_tokens(text):
+     words = len(text.split())
+     tokens = len(count_tokenizer.tokenize(text))
+     return words, tokens
+
+
 
  @spaces.GPU(duration=150)
  def process_input(input_file):
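
For reference, a minimal sketch of how the pieces above fit together, not part of the commit itself. It assumes the lazy objects defined in app.py are importable and uses a hypothetical transcript string; RetrievalQA chains take their input under the "query" key and return the answer under "result".

    # Minimal usage sketch; `transcript` is a hypothetical string,
    # e.g. the output of the diarization step.
    transcript = "Speaker A: I usually keep to myself at parties..."

    # Count words and tokens with the cached Mistral tokenizer.
    words, tokens = count_words_and_tokens(transcript)
    print(f"Transcript length: {words} words, {tokens} tokens")

    # Build (or fetch) the three RetrievalQA chains and query one of them.
    # The retrieved definitions fill {context} and the query fills {question}.
    attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
    result = bigfive_chain.invoke({"query": transcript})
    print(result["result"])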