reab5555 committed on
Commit
5e7b13d
·
verified ·
1 Parent(s): ec6ecc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -311
app.py CHANGED
@@ -1,301 +1,6 @@
1
- import os
2
  import gradio as gr
3
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
4
- from langchain_community.llms import HuggingFacePipeline
5
- from langchain_community.document_loaders import TextLoader, PyPDFLoader
6
- from langchain.text_splitter import CharacterTextSplitter
7
- from langchain_community.vectorstores import FAISS
8
- from langchain_community.embeddings import HuggingFaceEmbeddings
9
- from langchain.prompts import PromptTemplate
10
- from langchain.chains import RetrievalQA
11
- from huggingface_hub import login
12
- import diarization
13
- import shutil
14
- import spaces
15
- import time
16
- from langdetect import detect
17
- import plotly.graph_objs as go
18
- import re
19
- from collections import Counter
20
-
21
- # Set environment variable to disable tokenizers parallelism warning
22
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
-
24
- # Get Hugging Face token from Space secret
25
- hf_token = os.environ.get('hf_secret')
26
- if not hf_token:
27
- raise ValueError("HF_TOKEN not found in environment variables. Please set it in the Space secrets.")
28
-
29
- # Login to Hugging Face
30
- login(token=hf_token)
31
-
32
- # Language detection function
33
- def detect_language(text):
34
- try:
35
- return detect(text)
36
- except:
37
- return "en" # default to English if detection fails
38
-
39
- # Lazy initialization for the pipeline
40
- class LazyPipeline:
41
- def __init__(self):
42
- self.pipeline = None
43
-
44
- @spaces.GPU(duration=250)
45
- def get_pipeline(self):
46
- if self.pipeline is None:
47
- import torch
48
- model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
49
- tokenizer = AutoTokenizer.from_pretrained(model_name)
50
- model = AutoModelForCausalLM.from_pretrained(
51
- model_name,
52
- torch_dtype=torch.float16,
53
- device_map="auto",
54
- )
55
- self.pipeline = pipeline(
56
- "text-generation",
57
- model=model,
58
- tokenizer=tokenizer,
59
- max_new_tokens=4096,
60
- temperature=0.8,
61
- )
62
- return self.pipeline
63
-
64
- lazy_pipe = LazyPipeline()
65
-
66
- # Create a LangChain wrapper around the pipeline
67
- class LazyLLM:
68
- def __init__(self, lazy_pipeline):
69
- self.lazy_pipeline = lazy_pipeline
70
- self.llm = None
71
-
72
- @spaces.GPU(duration=150)
73
- def get_llm(self):
74
- if self.llm is None:
75
- pipe = self.lazy_pipeline.get_pipeline()
76
- self.llm = HuggingFacePipeline(pipeline=pipe)
77
- return self.llm
78
-
79
- lazy_llm = LazyLLM(lazy_pipe)
80
-
81
- # Load instruction files
82
- def load_instructions(file_path):
83
- with open(file_path, 'r') as file:
84
- return file.read().strip()
85
-
86
- attachments_task = load_instructions("tasks/Attachments_task.txt")
87
- bigfive_task = load_instructions("tasks/BigFive_task.txt")
88
- personalities_task = load_instructions("tasks/Personalities_task.txt")
89
-
90
- # Load knowledge files
91
- def load_knowledge(file_path):
92
- loader = TextLoader(file_path)
93
- documents = loader.load()
94
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
95
- texts = text_splitter.split_documents(documents)
96
- return texts
97
-
98
- attachments_knowledge = load_knowledge("knowledge/bartholomew_attachments_definitions - no int.txt")
99
- bigfive_knowledge = load_knowledge("knowledge/bigfive_definitions.txt")
100
- personalities_knowledge = load_knowledge("knowledge/personalities_definitions.txt")
101
-
102
- # Create vector stores
103
- embeddings = HuggingFaceEmbeddings()
104
- attachments_db = FAISS.from_documents(attachments_knowledge, embeddings)
105
- bigfive_db = FAISS.from_documents(bigfive_knowledge, embeddings)
106
- personalities_db = FAISS.from_documents(personalities_knowledge, embeddings)
107
-
108
- # Lazy initialization for retrieval chains
109
- class LazyChains:
110
- def __init__(self, lazy_llm):
111
- self.lazy_llm = lazy_llm
112
- self.attachments_chain = None
113
- self.bigfive_chain = None
114
- self.personalities_chain = None
115
-
116
- def create_prompt(self, task):
117
- return PromptTemplate(
118
- template=task + "\n\nContext: {context}\n\nTask: {question}\n\n-----------\n\nAnswer: ",
119
- input_variables=["context", "question"]
120
- )
121
-
122
- @spaces.GPU(duration=200)
123
- def get_chains(self):
124
- if self.attachments_chain is None:
125
- llm = self.lazy_llm.get_llm()
126
- self.attachments_chain = RetrievalQA.from_chain_type(
127
- llm=llm,
128
- chain_type="stuff",
129
- retriever=attachments_db.as_retriever(),
130
- chain_type_kwargs={"prompt": self.create_prompt(attachments_task)}
131
- )
132
- self.bigfive_chain = RetrievalQA.from_chain_type(
133
- llm=llm,
134
- chain_type="stuff",
135
- retriever=bigfive_db.as_retriever(),
136
- chain_type_kwargs={"prompt": self.create_prompt(bigfive_task)}
137
- )
138
- self.personalities_chain = RetrievalQA.from_chain_type(
139
- llm=llm,
140
- chain_type="stuff",
141
- retriever=personalities_db.as_retriever(),
142
- chain_type_kwargs={"prompt": self.create_prompt(personalities_task)}
143
- )
144
- return self.attachments_chain, self.bigfive_chain, self.personalities_chain
145
-
146
- lazy_chains = LazyChains(lazy_llm)
147
-
148
- def count_words_and_tokens(text):
149
- words = len(text.split())
150
- tokens = len(AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3").tokenize(text))
151
- return words, tokens
152
-
153
- @spaces.GPU(duration=150)
154
- def process_input(input_file, progress=gr.Progress()):
155
- start_time = time.time()
156
-
157
- progress(0, desc="Processing file...")
158
-
159
- file_extension = os.path.splitext(input_file.name)[1].lower()
160
-
161
- if file_extension == '.txt':
162
- with open(input_file.name, 'r', encoding='utf-8') as file:
163
- content = file.read()
164
- words, tokens = count_words_and_tokens(content)
165
- input_info = f"Text file processed. Words: {words}, Tokens: {tokens}"
166
- elif file_extension == '.pdf':
167
- loader = PyPDFLoader(input_file.name)
168
- pages = loader.load_and_split()
169
- content = '\n'.join([page.page_content for page in pages])
170
- words, tokens = count_words_and_tokens(content)
171
- input_info = f"PDF file processed. Words: {words}, Tokens: {tokens}"
172
- elif file_extension in ['.mp4', '.avi', '.mov']:
173
- temp_video_path = "temp_video" + file_extension
174
- shutil.copy2(input_file.name, temp_video_path)
175
-
176
- progress(0.2, desc="Transcribing video...")
177
-
178
- language = "en" # Default to English for video files
179
- diarization.process_video(temp_video_path, hf_token, language)
180
-
181
- srt_path = temp_video_path.replace(file_extension, "_combined.srt")
182
- with open(srt_path, 'r', encoding='utf-8') as file:
183
- content = file.read()
184
- words, tokens = count_words_and_tokens(content)
185
- input_info = f"Input Words: {words} / Input Tokens: {tokens}"
186
- else:
187
- return "Unsupported file format. Please upload a TXT, PDF, or video file.", None, None, None, None, None, None
188
-
189
- detected_language = detect_language(content)
190
-
191
- progress(0.4, desc="Analyzing content...")
192
-
193
- attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()
194
-
195
- progress(0.6, desc="Analyzing attachments...")
196
- attachments_result = attachments_chain({"query": content})
197
- attachments_answer = attachments_result['result'].split("-----------\n\nAnswer:")[-1].strip()
198
-
199
- progress(0.7, desc="Analyzing Big Five traits...")
200
- bigfive_result = bigfive_chain({"query": content})
201
- bigfive_answer = bigfive_result['result'].split("-----------\n\nAnswer:")[-1].strip()
202
-
203
- progress(0.8, desc="Analyzing personalities...")
204
- personalities_result = personalities_chain({"query": content})
205
- personalities_answer = personalities_result['result'].split("-----------\n\nAnswer:")[-1].strip()
206
-
207
- end_time = time.time()
208
- execution_time = end_time - start_time
209
-
210
- execution_info = f"{execution_time:.2f} seconds"
211
-
212
- progress(1.0, desc="Analysis complete!")
213
-
214
- print("Attachments answer:", attachments_answer)
215
- print("Big Five answer:", bigfive_answer)
216
- print("Personalities answer:", personalities_answer)
217
-
218
- return ("Analysis complete!", execution_info, detected_language, input_info,
219
- attachments_answer, bigfive_answer, personalities_answer)
220
-
221
- def extract_speaker_data(text):
222
- speakers = {}
223
- current_speaker = None
224
- for line in text.split('\n'):
225
- if line.lower().startswith("speaker"):
226
- current_speaker = line.split(":")[1].strip() if ":" in line else line.split()[1]
227
- speakers[current_speaker] = {}
228
- elif current_speaker and ":" in line:
229
- key, value = line.split(":", 1)
230
- try:
231
- speakers[current_speaker][key.strip()] = float(value.strip())
232
- except ValueError:
233
- # If conversion to float fails, try to extract a number from the string
234
- match = re.search(r"[-+]?\d*\.\d+|\d+", value)
235
- if match:
236
- speakers[current_speaker][key.strip()] = float(match.group())
237
- return speakers
238
-
239
- def create_bar_chart(data, title, speaker):
240
- fig = go.Figure(data=[go.Bar(
241
- x=list(data.keys()),
242
- y=list(data.values()),
243
- marker_color=['red', 'green', 'blue', 'yellow', 'purple', 'orange', 'pink', 'cyan', 'magenta', 'brown'][:len(data)]
244
- )])
245
- fig.update_layout(title=f"{title} - Speaker {speaker}", xaxis_title="Traits", yaxis_title="Score")
246
- return fig
247
-
248
- def update_visibility_and_charts(status, exec_time, lang, info, attachments, bigfive, personalities):
249
- print("Attachments output:", attachments)
250
- print("Big Five output:", bigfive)
251
- print("Personalities output:", personalities)
252
-
253
- charts = []
254
-
255
- if not any([attachments, bigfive, personalities]):
256
- print("No data available for chart creation.")
257
- return [
258
- gr.update(value="No data available for analysis. Please try again with a different input.", visible=True),
259
- gr.update(value=exec_time, visible=True),
260
- gr.update(value=lang, visible=True),
261
- gr.update(value=info, visible=True),
262
- ] + [] # No charts to return
263
-
264
- for analysis_text in [attachments, bigfive, personalities]:
265
- speakers_data = extract_speaker_data(analysis_text)
266
- if not speakers_data:
267
- print(f"No speaker data extracted from: {analysis_text}")
268
-
269
- # Determine the two main speakers
270
- speaker_counts = Counter(speakers_data.keys())
271
- main_speakers = [speaker for speaker, count in speaker_counts.most_common(2)]
272
-
273
- for speaker in main_speakers:
274
- data = speakers_data.get(speaker, {})
275
- attachment_data = {k: v for k, v in data.items() if k in ["Secured", "Anxious-Preoccupied", "Dismissive-Avoidant", "Fearful-Avoidant"]}
276
- if attachment_data:
277
- charts.append(create_bar_chart(attachment_data, "Attachment Styles", speaker))
278
-
279
- bigfive_data = {k: v for k, v in data.items() if k in ["Extraversion", "Agreeableness", "Conscientiousness", "Neuroticism", "Openness"]}
280
- if bigfive_data:
281
- charts.append(create_bar_chart(bigfive_data, "Big Five Traits", speaker))
282
-
283
- personality_data = {k: v for k, v in data.items() if k in ["Depressed", "Paranoid", "Schizoid-Schizotypal", "Antisocial-Psychopathic", "Borderline-Dysregulated", "Hysteric-Histrionic", "Narcissistic", "Anxious-Avoidant", "Dependent-Victimized", "Obsessional"]}
284
- if personality_data:
285
- charts.append(create_bar_chart(personality_data, "Personality Traits", speaker))
286
-
287
- self_others_data = {k: v for k, v in data.items() if k in ["Self", "Others", "Anxiety", "Avoidance"]}
288
- if self_others_data:
289
- charts.append(create_bar_chart(self_others_data, "Self-Others and Anxiety-Avoidance", speaker))
290
-
291
- print("Number of charts created:", len(charts))
292
-
293
- return [
294
- gr.update(value=status, visible=True),
295
- gr.update(value=exec_time, visible=True),
296
- gr.update(value=lang, visible=True),
297
- gr.update(value=info, visible=True),
298
- ] + charts
299
 
300
  def create_interface():
301
  with gr.Blocks() as iface:
@@ -312,25 +17,16 @@ def create_interface():
312
  detected_language = gr.Textbox(label="Detected Language", visible=False)
313
  input_info = gr.Textbox(label="Input Information", visible=False)
314
 
315
- # Hidden textboxes for storing model outputs
316
  attachments_output = gr.Textbox(visible=False)
317
  bigfive_output = gr.Textbox(visible=False)
318
  personalities_output = gr.Textbox(visible=False)
319
 
320
- # Container for dynamically created charts
321
  chart_container = gr.Column()
322
 
323
  def process_and_update(input_file):
324
- # First, process the input
325
  results = process_input(input_file)
326
-
327
- # Then, create and update charts
328
  chart_outputs = update_visibility_and_charts(*results)
329
-
330
- # Create new chart components based on the number of charts
331
- new_charts = [gr.Plot(visible=True) for _ in range(len(chart_outputs) - 4)] # -4 for the non-chart outputs
332
-
333
- # Update the chart container
334
  return chart_outputs[:4] + [gr.Column(new_charts)]
335
 
336
  input_file.upload(
@@ -340,8 +36,6 @@ def create_interface():
340
  )
341
 
342
  return iface
343
-
344
- iface = create_interface()
345
 
346
- # Launch the app
347
- iface.launch()
 
 
1
  import gradio as gr
2
+ from file_processing import process_input
3
+ from chart_creation import update_visibility_and_charts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def create_interface():
6
  with gr.Blocks() as iface:
 
17
  detected_language = gr.Textbox(label="Detected Language", visible=False)
18
  input_info = gr.Textbox(label="Input Information", visible=False)
19
 
 
20
  attachments_output = gr.Textbox(visible=False)
21
  bigfive_output = gr.Textbox(visible=False)
22
  personalities_output = gr.Textbox(visible=False)
23
 
 
24
  chart_container = gr.Column()
25
 
26
  def process_and_update(input_file):
 
27
  results = process_input(input_file)
 
 
28
  chart_outputs = update_visibility_and_charts(*results)
29
+ new_charts = [gr.Plot(visible=True) for _ in range(len(chart_outputs) - 4)]
 
 
 
 
30
  return chart_outputs[:4] + [gr.Column(new_charts)]
31
 
32
  input_file.upload(
 
36
  )
37
 
38
  return iface
 
 
39
 
40
+ iface = create_interface()
41
+ iface.launch()