Sa-m committed on
Commit c405013 · verified · 1 Parent(s): c3b0b01

Update app.py

Files changed (1)
  1. app.py +117 -150
app.py CHANGED
@@ -3,18 +3,19 @@ import matplotlib.pyplot as plt
 import nltk
 from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
+# from nltk.stem import WordNetLemmatizer # Not used, commented out
 from nltk.text import Text
 from nltk.probability import FreqDist
 from cleantext import clean
-import textract
+# import textract # Replaced by PyPDF2
+import PyPDF2 # Added for PDF parsing
 import urllib.request
 from io import BytesIO
 import sys
 import pandas as pd
-import cv2
+# import cv2 # Not used, commented out
 import re
-from wordcloud import WordCloud, ImageColorGenerator
+from wordcloud import WordCloud # , ImageColorGenerator # ImageColorGenerator not used, commented out
 from textblob import TextBlob
 from PIL import Image
 import os
@@ -28,37 +29,14 @@ import unidecode
 import contractions
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-
+# Load environment variables
 load_dotenv()
-import nltk
-import ssl
-
-def ensure_nltk_resources():
-    try:
-        nltk.data.find('tokenizers/punkt')
-        nltk.data.find('corpora/stopwords')
-    except LookupError:
-        print("NLTK resources not found. Downloading...")
-        try:
-            # Handling potential SSL issues (common on some systems)
-            _create_unverified_https_context = ssl._create_unverified_context
-        except AttributeError:
-            pass
-        else:
-            ssl._create_default_https_context = _create_unverified_https_context
-
-        nltk.download(['stopwords', 'wordnet', 'words'])
-        nltk.download('punkt')
-        nltk.download('punkt_tab')
-        print("NLTK resources downloaded successfully.")
-
-
-ensure_nltk_resources()
 
 # Download NLTK resources (Ensure this runs once or handle caching)
 # nltk.download(['stopwords', 'wordnet', 'words'])
 # nltk.download('punkt')
 # nltk.download('punkt_tab')
+
 # Initialize Groq client
 groq_api_key = os.getenv("GROQ_API_KEY")
 groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
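
With the runtime ensure_nltk_resources() helper removed, the NLTK data must exist before the app starts. A minimal one-time setup sketch (hypothetical, not part of this commit; e.g. run once at build or deploy time), mirroring the downloads the deleted helper performed:

    # setup_nltk.py, a hypothetical one-time setup script
    import nltk

    # Fetch the same corpora/tokenizers the removed runtime helper downloaded.
    for pkg in ['stopwords', 'wordnet', 'words', 'punkt', 'punkt_tab']:
        nltk.download(pkg)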
@@ -68,16 +46,36 @@ stop_words = set(stopwords.words('english'))
 stop_words.update({'ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'}) # Ensure stop_words is a set
 
 # --- Parsing & Preprocessing Functions ---
+# --- Replaced textract with PyPDF2 ---
 def Parsing(parsed_text):
+    """
+    Parses text from a PDF file using PyPDF2.
+    """
     try:
+        # Get the file path from the Gradio UploadFile object
         if hasattr(parsed_text, 'name'):
             file_path = parsed_text.name
         else:
+            # Fallback if it's somehow just a string path
             file_path = parsed_text
-        # Ensure textract handles encoding correctly or handle errors
-        raw_party = textract.process(file_path) # Removed encoding/method for broader compatibility
-        decoded_text = raw_party.decode('utf-8', errors='ignore') # Decode bytes to string, handling errors
-        return clean(decoded_text) # Pass decoded string to clean
+
+        # Use PyPDF2 to read the PDF
+        text = ""
+        with open(file_path, 'rb') as pdf_file: # Open in binary read mode
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
+                text += page.extract_text() + "\n" # Add newline between pages
+
+        # Clean the extracted text
+        return clean(text)
+
+    except FileNotFoundError:
+        print(f"Error parsing PDF: File not found at path: {file_path}")
+        return f"Error parsing PDF: File not found. Please check the file upload."
+    except PyPDF2.errors.PdfReadError as pre:
+        print(f"Error reading PDF: {pre}")
+        return f"Error reading PDF: The file might be corrupted or password-protected."
     except Exception as e:
         print(f"Error parsing PDF: {e}")
         return f"Error parsing PDF: {e}"
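
The replacement parser is a plain PyPDF2 page loop. For reference, the same extraction path as a standalone sketch (assuming a local sample.pdf; note that PyPDF2's extract_text() yields empty text for scanned, image-only pages, since it does no OCR):

    # Standalone sketch of the new extraction path (sample.pdf is hypothetical).
    import PyPDF2
    from cleantext import clean

    with open('sample.pdf', 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        raw = "\n".join(page.extract_text() or "" for page in reader.pages)

    print(clean(raw)[:200])  # first 200 characters of the cleaned text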
@@ -104,8 +102,9 @@ def Preprocess(textParty):
 def generate_summary(text):
     if not groq_client:
         return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
-    if len(text) > 10000:
-        text = text[:10000]
+    # Adjusted truncation length for potentially better summary context
+    if len(text) > 15000:
+        text = text[:15000]
     try:
         completion = groq_client.chat.completions.create(
             model="llama3-8b-8192", # Or your preferred model
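
The truncation above cuts at a fixed character count, which can split the final sentence mid-stream. A possible refinement (a sketch only, not in this commit) that trims on sentence boundaries using the already-imported sent_tokenize:

    def truncate_on_sentences(text, limit=15000):
        # Keep whole sentences up to roughly `limit` characters.
        if len(text) <= limit:
            return text
        kept, total = [], 0
        for sent in sent_tokenize(text):
            if total + len(sent) > limit:
                break
            kept.append(sent)
            total += len(sent) + 1  # +1 for the joining space
        return " ".join(kept) or text[:limit]  # fall back if one sentence exceeds the limit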
@@ -120,6 +119,61 @@ def generate_summary(text):
     except Exception as e:
         return f"Error generating summary: {str(e)}"
 
+# --- New LLM-based Search Function ---
+def get_contextual_search_result(target_word, tar_passage, groq_client_instance, max_context_length=8000):
+    """
+    Uses the LLM to provide contextual information about the target word within the passage.
+    """
+    if not target_word or target_word.strip() == "":
+        return "Please enter a search term."
+
+    if not groq_client_instance:
+        return "Contextual search requires the LLM API. Please set up your GROQ_API_KEY."
+
+    # Truncate passage if too long for the model/context window
+    original_length = len(tar_passage)
+    if original_length > max_context_length:
+        tar_passage_truncated = tar_passage[:max_context_length]
+        print(f"Warning: Passage truncated for LLM search context from {original_length} to {max_context_length} characters.")
+    else:
+        tar_passage_truncated = tar_passage
+
+    # --- Improved Prompt ---
+    prompt = f"""
+    You are an expert political analyst. You have been given a section of a political manifesto and a specific search term.
+    Your task is to extract and summarize all information related to the search term from the provided text.
+    Focus on:
+    1. Specific policies, promises, or statements related to the term.
+    2. The context in which the term is used.
+    3. Any key details, figures, or commitments mentioned.
+    Present your findings concisely. If the term is not relevant or not found in the provided text section, state that clearly.
+    Search Term: {target_word}
+    Manifesto Text Section:
+    {tar_passage_truncated}
+    Relevant Information:
+    """
+
+    try:
+        completion = groq_client_instance.chat.completions.create(
+            model="llama3-8b-8192", # Use the same or a suitable model
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant skilled at analyzing political texts and extracting relevant information based on a search query. Provide clear, concise summaries."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.2, # Low temperature for more factual extraction
+            max_tokens=1000 # Adjust based on expected output length
+        )
+        result = completion.choices[0].message.content.strip()
+        # Add a note if the input was truncated
+        if original_length > max_context_length:
+            result = f"(Note: Analysis based on the first {max_context_length} characters of the manifesto.)\n\n" + result
+        return result if result else f"No specific context for '{target_word}' could be generated from the provided text section."
+    except Exception as e:
+        error_msg = f"Error during contextual search for '{target_word}': {str(e)}"
+        print(error_msg)
+        traceback.print_exc()
+        return error_msg # Or return the error message directly
+
 def fDistance(text2Party):
     word_tokens_party = word_tokenize(text2Party)
     fdistance = FreqDist(word_tokens_party).most_common(10)
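
A quick smoke test for the new search helper (hypothetical snippet; requires a valid GROQ_API_KEY and network access):

    import os
    import groq

    client = groq.Groq(api_key=os.getenv("GROQ_API_KEY"))
    passage = "The party pledges 300 units of free electricity and free bus travel for women."
    print(get_contextual_search_result("electricity", passage, client))
    # Expect a short summary of the electricity pledge extracted from the passage.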
@@ -162,7 +216,7 @@ def normalize(d, target=1.0):
     return {key: value * factor for key, value in d.items()}
 
 # --- Visualization Functions with Error Handling ---
-
+# --- Improved safe_plot to handle apply_aspect errors ---
 def safe_plot(func, *args, **kwargs):
     """Executes a plotting function and returns the image, handling errors."""
     buf = None # Initialize buffer
@@ -195,21 +249,19 @@ def safe_plot(func, *args, **kwargs):
         plt.close('all') # Aggressive close on error
         return None
 
-
-
 def fDistancePlot(text2Party):
-    """Generates the frequency distribution plot."""
     def plot_func():
         tokens = word_tokenize(text2Party)
         if not tokens:
-            plt.text(0.5, 0.5, "No data to plot", ha='center', va='center', transform=plt.gca().transAxes) # Use Axes coordinates
-            return
+            plt.text(0.5, 0.5, "No data to plot", ha='center', va='center')
+            return
         fdist = FreqDist(tokens)
         fdist.plot(15, title='Frequency Distribution')
-        plt.xticks(rotation=45, ha='right')
+        plt.xticks(rotation=45, ha='right') # Rotate x-axis labels if needed
         plt.tight_layout()
     return safe_plot(plot_func)
 
+# --- Updated DispersionPlot without passing 'ax' ---
 def DispersionPlot(textParty):
     """Generates the word dispersion plot."""
     buf = None # Initialize buffer
@@ -232,7 +284,7 @@ def DispersionPlot(textParty):
         print("Warning: No common words found for dispersion plot.")
         return None
 
-    # --- Key Fix: Manage figure explicitly without passing 'ax' ---
+    # --- Manage figure explicitly without passing 'ax' ---
     fig = plt.figure(figsize=(10, 5)) # Create figure explicitly
     plt.title('Dispersion Plot')
     # Call dispersion_plot without 'ax' argument
@@ -264,16 +316,17 @@ def DispersionPlot(textParty):
         plt.close('all') # Aggressive close on error
         return None # Return None on error
 
-
+# --- Updated word_cloud_generator with robust figure handling ---
 def word_cloud_generator(parsed_text_name, text_Party):
     """Generates the word cloud image."""
     buf = None # Initialize buffer
     try:
+        # Handle case where parsed_text_name might not have .name
         filename_lower = ""
         if hasattr(parsed_text_name, 'name') and parsed_text_name.name:
             filename_lower = parsed_text_name.name.lower()
         elif isinstance(parsed_text_name, str):
-            filename_lower = parsed_text_name.lower()
+            filename_lower = parsed_text_name.lower()
 
         mask_path = None
         if 'bjp' in filename_lower:
@@ -283,16 +336,18 @@ def word_cloud_generator(parsed_text_name, text_Party):
         elif 'aap' in filename_lower:
             mask_path = 'aapMain2.jpg'
 
+        # Generate word cloud
         if text_Party.strip() == "":
-            raise ValueError("Text for word cloud is empty")
+            raise ValueError("Text for word cloud is empty")
 
         # Generate word cloud object
         if mask_path and os.path.exists(mask_path):
             orgImg = Image.open(mask_path)
+            # Ensure mask is in the right format (e.g., uint8)
             if orgImg.mode != 'RGB':
                 orgImg = orgImg.convert('RGB')
             mask = np.array(orgImg)
-            wordcloud = WordCloud(max_words=3000, mask=mask, background_color='white', mode='RGBA').generate(text_Party)
+            wordcloud = WordCloud(max_words=3000, mask=mask, background_color='white', mode='RGBA').generate(text_Party) # Added mode='RGBA'
         else:
             wordcloud = WordCloud(max_words=2000, background_color='white', mode='RGBA').generate(text_Party)
 
@@ -305,13 +360,13 @@ def word_cloud_generator(parsed_text_name, text_Party):
         buf = BytesIO()
         # Handle potential apply_aspect error for word cloud too
         try:
-            fig.savefig(buf, format='png', bbox_inches='tight', dpi=300, facecolor='white') # Added dpi and facecolor
+            fig.savefig(buf, format='png', bbox_inches='tight', dpi=150, facecolor='white') # Added dpi and facecolor
         except AttributeError as ae:
             if "apply_aspect" in str(ae):
                 print(f"Warning: bbox_inches='tight' failed for Word Cloud ({ae}), saving without it.")
                 buf.seek(0)
                 buf = BytesIO()
-                fig.savefig(buf, format='png', dpi=300, facecolor='white')
+                fig.savefig(buf, format='png', dpi=150, facecolor='white')
             else:
                 raise
         buf.seek(0)
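
The save-then-retry pattern above guards against the apply_aspect AttributeError this code anticipates when bbox_inches='tight' is used. Distilled into a reusable sketch (same logic as the inline code; this helper does not exist in the commit):

    from io import BytesIO

    def fig_to_png_buffer(fig, dpi=150):
        # Save a figure to a PNG buffer, retrying without bbox_inches='tight'
        # if the apply_aspect AttributeError is raised.
        buf = BytesIO()
        try:
            fig.savefig(buf, format='png', bbox_inches='tight', dpi=dpi, facecolor='white')
        except AttributeError as ae:
            if "apply_aspect" not in str(ae):
                raise
            buf = BytesIO()  # discard any partial bytes from the failed save
            fig.savefig(buf, format='png', dpi=dpi, facecolor='white')
        buf.seek(0)
        return buf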
@@ -327,109 +382,23 @@ def word_cloud_generator(parsed_text_name, text_Party):
         plt.close('all') # Aggressive close on error
         return None # Return None on error
 
-
-# Initial design for concordance based search
-def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
-    """
-    Function to get all the phrases that contain the target word in a text/passage.
-    """
-    if not target_word or target_word.strip() == "":
-        return "Please enter a search term"
-    tokens = nltk.word_tokenize(tar_passage)
-    text = nltk.Text(tokens)
-    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
-    offsets = c.offsets(target_word)
-    if not offsets:
-        return f"Word '{target_word}' not found."
-    concordance_txt = [
-        text.tokens[max(0, offset - left_margin):offset + right_margin]
-        for offset in offsets[:numLins]
-    ]
-    result = [' '.join(con_sub) for con_sub in concordance_txt]
-    return '\n'.join(result) # Use newline for better readability in textbox
-
-
-def get_contextual_search_result(target_word, tar_passage, groq_client_instance, max_context_length=8000):
-    """
-    Uses the LLM to provide contextual information about the target word within the passage.
-    """
-    if not target_word or target_word.strip() == "":
-        return "Please enter a search term."
-
-    if not groq_client_instance:
-        return "Contextual search requires the LLM API. Please set up your GROQ_API_KEY."
-
-    # Basic check if word exists (optional, LLM can handle it too)
-    # Simple check, might generate false positives/negatives
-    # if target_word.lower() not in tar_passage.lower():
-    #     return f"The term '{target_word}' was not found in the manifesto text."
-
-    # Truncate passage if too long for the model/context window
-    original_length = len(tar_passage)
-    if original_length > max_context_length:
-        # Simple truncation; could be improved to ensure sentences are complete
-        tar_passage_truncated = tar_passage[:max_context_length]
-        print(f"Warning: Passage truncated for LLM search context from {original_length} to {max_context_length} characters.")
-    else:
-        tar_passage_truncated = tar_passage
-
-    # --- Improved Prompt ---
-    prompt = f"""
-    You are an expert political analyst. You have been given a section of a political manifesto and a specific search term.
-    Your task is to extract and summarize all information related to the search term from the provided text.
-    Focus on:
-    1. Specific policies, promises, or statements related to the term.
-    2. The context in which the term is used.
-    3. Any key details, figures, or commitments mentioned.
-    Present your findings concisely. If the term is not relevant or not found in the provided text section, state that clearly.
-    Search Term: {target_word}
-    Manifesto Text Section:
-    {tar_passage_truncated}
-    Relevant Information:
-    """
-
-    try:
-        completion = groq_client_instance.chat.completions.create(
-            model="llama3-8b-8192", # Use the same or a suitable model
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant skilled at analyzing political texts and extracting relevant information based on a search query. Provide clear, concise summaries."},
-                {"role": "user", "content": prompt}
-            ],
-            temperature=0.2, # Low temperature for more factual extraction
-            max_tokens=1000 # Adjust based on expected output length
-        )
-        result = completion.choices[0].message.content.strip()
-        # Add a note if the input was truncated
-        if original_length > max_context_length:
-            result = f"(Note: Analysis based on the first {max_context_length} characters of the manifesto.)\n\n" + result
-        return result if result else f"No specific context for '{target_word}' could be generated from the provided text section."
-    except Exception as e:
-        error_msg = f"Error during contextual search for '{target_word}': {str(e)}"
-        print(error_msg)
-        traceback.print_exc()
-        # Fallback to concordance if LLM fails?
-        # return get_all_phases_containing_tar_wrd(target_word, tar_passage)
-        return error_msg # Or return the error message directly
-
-
+# --- Main Analysis Function ---
 def analysis(Manifesto, Search):
     try:
         if Manifesto is None:
-            # Ensure return order matches the outputs list
             return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
         if Search.strip() == "":
             Search = "government"
-        raw_party = Parsing(Manifesto)
+        raw_party = Parsing(Manifesto) # Uses PyPDF2 now
         if isinstance(raw_party, str) and raw_party.startswith("Error"):
             return raw_party, {}, None, None, None, None, None, "Parsing failed"
         text_Party = clean_text(raw_party)
         text_Party_processed = Preprocess(text_Party)
 
         # --- Perform Search FIRST using the ORIGINAL text for better context ---
-        # Pass the original raw text for richer context to the LLM
+        # Use the new LLM-based search function
         searChRes = get_contextual_search_result(Search, raw_party, groq_client)
 
-        # --- Then proceed with other analyses ---
         summary = generate_summary(raw_party) # Use raw_party for summary for more context?
 
         # --- Sentiment Analysis ---
@@ -450,9 +419,10 @@ def analysis(Manifesto, Search):
         sentiment_plot = safe_plot(lambda: df_dummy['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
         subjectivity_plot = safe_plot(lambda: df_dummy['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
         freq_plot = fDistancePlot(text_Party_processed)
-        dispersion_plot = DispersionPlot(text_Party_processed) # Use fixed version
-        wordcloud = word_cloud_generator(Manifesto, text_Party_processed) # Pass Manifesto object itself
+        dispersion_plot = DispersionPlot(text_Party_processed) # Uses updated version
+        wordcloud = word_cloud_generator(Manifesto, text_Party_processed) # Pass Manifesto object itself, uses updated version
         fdist_Party = fDistance(text_Party_processed)
+        # searChRes is now generated earlier using LLM
 
         return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
 
@@ -463,8 +433,8 @@ def analysis(Manifesto, Search):
         # Return error messages/images in the correct order
         return error_msg, {}, None, None, None, None, None, "Analysis failed"
 
-
-# --- Gradio Interface (remains largely the same, just ensuring output variable names match) ---
+# --- Gradio Interface ---
+# Use Blocks for custom layout
 with gr.Blocks(title='Manifesto Analysis') as demo:
     gr.Markdown("# Manifesto Analysis")
     # Input Section
@@ -481,9 +451,8 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
     with gr.TabItem("Summary"):
         summary_output = gr.Textbox(label='AI-Generated Summary', lines=10, interactive=False)
 
-    # --- Search Results Tab ---
+    # --- Search Results Tab (uses LLM output now) ---
     with gr.TabItem("Search Results"):
-        # Use the specific output variable defined in the layout
         search_output = gr.Textbox(label='Context Based Search Results', lines=15, interactive=False, max_lines=20) # Increased lines/max_lines
 
     # --- Key Topics Tab ---
@@ -515,7 +484,7 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
         fn=analysis,
         inputs=[file_input, search_input],
         outputs=[
-            search_output, # 1 (Now contextual)
+            search_output, # 1 (Now contextual LLM output)
             topics_output, # 2
             sentiment_output, # 3
             subjectivity_output, # 4
@@ -528,7 +497,6 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
     )
 
     # --- Examples ---
-    # Ensure outputs list references the PREDEFINED components from the layout
    gr.Examples(
         examples=[
             ["Example/AAP_Manifesto_2019.pdf", "government"],
  ["Example/AAP_Manifesto_2019.pdf", "government"],
@@ -536,11 +504,10 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
536
  ["Example/Congress_Manifesto_2019.pdf", "safety"]
537
  ],
538
  inputs=[file_input, search_input],
539
- # --- Key Fix: Reference the predefined output components ---
540
- outputs=[search_output, topics_output, sentiment_output, subjectivity_output, wordcloud_output, freq_output, dispersion_output, summary_output],
541
  fn=analysis # Run analysis on example click
542
  )
543
 
544
-
545
  if __name__ == "__main__":
546
- demo.launch(debug=True, share=False, show_error=True)
 