Sa-m committed on
Commit 2cbc17e · verified · 1 Parent(s): cbcdc2b

Update app.py

Files changed (1)
  1. app.py +488 -93
app.py CHANGED
@@ -28,21 +28,21 @@ import unidecode
28
  import contractions
29
  from sklearn.feature_extraction.text import TfidfVectorizer
30
 
31
-
32
  # Load environment variables
33
  load_dotenv()
34
 
35
- # Download NLTK resources
36
- nltk.download(['stopwords', 'wordnet', 'words'])
37
- nltk.download('punkt')
38
- nltk.download('punkt_tab')
39
  # Initialize Groq client
40
  groq_api_key = os.getenv("GROQ_API_KEY")
41
  groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
42
 
43
  # Stopwords customization
44
  stop_words = set(stopwords.words('english'))
45
- stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
46
 
47
  # --- Parsing & Preprocessing Functions ---
48
  def Parsing(parsed_text):
@@ -51,8 +51,10 @@ def Parsing(parsed_text):
51
  file_path = parsed_text.name
52
  else:
53
  file_path = parsed_text
54
- raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
55
- return clean(raw_party)
56
  except Exception as e:
57
  print(f"Error parsing PDF: {e}")
58
  return f"Error parsing PDF: {e}"
@@ -83,10 +85,10 @@ def generate_summary(text):
83
  text = text[:10000]
84
  try:
85
  completion = groq_client.chat.completions.create(
86
- model="llama3-8b-8192",
87
  messages=[
88
  {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
89
- {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
90
  ],
91
  temperature=0.3,
92
  max_tokens=800
@@ -99,25 +101,37 @@ def fDistance(text2Party):
99
  word_tokens_party = word_tokenize(text2Party)
100
  fdistance = FreqDist(word_tokens_party).most_common(10)
101
  mem = {x[0]: x[1] for x in fdistance}
102
-
103
  vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
104
- tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
105
- feature_names = vectorizer.get_feature_names_out()
106
-
107
- tfidf_scores = {}
108
- for i, word in enumerate(feature_names):
109
- scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
110
- if scores:
111
- tfidf_scores[word] = sum(scores) / len(scores)
112
-
113
- combined_scores = {}
114
- for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
115
- freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
116
- tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
117
- combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
118
-
119
- top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
120
- return normalize(top_words)
121
 
122
  def normalize(d, target=1.0):
123
  raw = sum(d.values())
@@ -130,64 +144,105 @@ def safe_plot(func, *args, **kwargs):
130
  plt.clf()
131
  func(*args, **kwargs)
132
  buf = BytesIO()
133
- plt.savefig(buf, format='png')
134
  buf.seek(0)
135
- return Image.open(buf)
136
  except Exception as e:
137
- print(f"Plotting error: {e}")
138
- return None
139
 
140
  def fDistancePlot(text2Party):
141
- return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
142
 
143
  def DispersionPlot(textParty):
144
  try:
145
  word_tokens_party = word_tokenize(textParty)
146
- moby = Text(word_tokens_party) # Ensure Text is imported
147
  fdistance = FreqDist(word_tokens_party)
148
- word_Lst = [fdistance.most_common(6)[x][0] for x in range(5)]
149
- plt.figure(figsize=(4, 3))
150
  plt.title('Dispersion Plot')
151
  moby.dispersion_plot(word_Lst)
152
  plt.tight_layout()
153
  buf = BytesIO()
154
- plt.savefig(buf, format='png')
155
  buf.seek(0)
156
  img = Image.open(buf)
157
- plt.clf()
158
  return img
159
  except Exception as e:
160
  print(f"Dispersion plot error: {e}")
161
  return None
162
 
163
  def word_cloud_generator(parsed_text_name, text_Party):
164
  try:
165
- parsed = parsed_text_name.lower()
166
- if 'bjp' in parsed:
167
  mask_path = 'bjpImg2.jpeg'
168
- elif 'congress' in parsed:
169
  mask_path = 'congress3.jpeg'
170
- elif 'aap' in parsed:
171
  mask_path = 'aapMain2.jpg'
172
- else:
173
- mask_path = None
174
 
175
  if mask_path and os.path.exists(mask_path):
176
  orgImg = Image.open(mask_path)
177
  mask = np.array(orgImg)
178
- wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
179
- plt.imshow(wordcloud)
180
  else:
181
- wordcloud = WordCloud(max_words=2000).generate(text_Party)
182
- plt.imshow(wordcloud)
183
  plt.axis("off")
184
  buf = BytesIO()
185
- plt.savefig(buf, format='png')
186
  buf.seek(0)
187
- return Image.open(buf)
188
  except Exception as e:
189
  print(f"Word cloud error: {e}")
190
- return None
191
 
192
  def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
193
  """
@@ -195,19 +250,18 @@ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10,
195
  """
196
  if not target_word or target_word.strip() == "":
197
  return "Please enter a search term"
198
-
199
  tokens = nltk.word_tokenize(tar_passage)
200
  text = nltk.Text(tokens)
201
  c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
202
  offsets = c.offsets(target_word)
203
-
 
204
  concordance_txt = [
205
  text.tokens[max(0, offset - left_margin):offset + right_margin]
206
  for offset in offsets[:numLins]
207
  ]
208
-
209
  result = [' '.join(con_sub) for con_sub in concordance_txt]
210
- return '\n'.join(result)
211
 
212
  # --- Main Analysis Function ---
213
  def analysis(Manifesto, Search):
@@ -216,27 +270,35 @@ def analysis(Manifesto, Search):
216
  return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
217
  if Search.strip() == "":
218
  Search = "government"
219
-
220
  raw_party = Parsing(Manifesto)
221
  if isinstance(raw_party, str) and raw_party.startswith("Error"):
222
  return raw_party, {}, None, None, None, None, None, "Parsing failed"
223
 
224
  text_Party = clean_text(raw_party)
225
  text_Party_processed = Preprocess(text_Party)
226
- summary = generate_summary(raw_party)
227
 
228
- df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
229
- df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
230
- df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
231
- df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
232
- df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
233
 
234
- # Generate Plots with Safe Plotting
235
- sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
236
- subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
237
  freq_plot = fDistancePlot(text_Party_processed)
238
  dispersion_plot = DispersionPlot(text_Party_processed)
239
- wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
240
 
241
  fdist_Party = fDistance(text_Party_processed)
242
  searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
@@ -244,55 +306,388 @@ def analysis(Manifesto, Search):
244
  return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
245
 
246
  except Exception as e:
247
- error_msg = f"Critical error: {str(e)}"
248
  print(error_msg)
249
  traceback.print_exc()
250
  return error_msg, {}, None, None, None, None, None, "Analysis failed"
251
 
252
- # --- Gradio Interface ---
253
- Search_txt = "text"
254
- filePdf = "file"
255
256
  with gr.Blocks(title='Manifesto Analysis') as demo:
257
  gr.Markdown("# Manifesto Analysis")
258
  with gr.Row():
259
- with gr.Column():
260
- file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
261
- search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
262
- submit_btn = gr.Button("Analyze Manifesto")
263
  with gr.Tabs():
264
- with gr.TabItem("Summary"): gr.Textbox(label='LLM Based Summary', lines=10)
265
- with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
266
- with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
267
  with gr.TabItem("Visualizations"):
268
- with gr.Row():
269
- gr.Image(label='Sentiment Analysis'), gr.Image(label='Subjectivity Analysis')
270
- with gr.Row():
271
- gr.Image(label='Word Cloud'), gr.Image(label='Frequency Distribution')
272
- gr.Image(label='Dispersion Plot')
273
 
274
  submit_btn.click(
275
  fn=analysis,
276
  inputs=[file_input, search_input],
277
  outputs=[
278
- gr.Textbox(label='Context Based Search'),
279
- gr.Label(label="Most Relevant Topics (LLM Enhanced)"),
280
- gr.Image(label='Sentiment Analysis'),
281
- gr.Image(label='Subjectivity Analysis'),
282
- gr.Image(label='Word Cloud'),
283
- gr.Image(label='Frequency Distribution'),
284
- gr.Image(label='Dispersion Plot'),
285
- gr.Textbox(label='AI-Generated Summary', lines=10)
286
- ]
287
  )
288
 
 
289
  gr.Examples(
290
  examples=[
291
  ["Example/AAP_Manifesto_2019.pdf", "government"],
292
  ["Example/Bjp_Manifesto_2019.pdf", "environment"],
293
  ["Example/Congress_Manifesto_2019.pdf", "safety"]
294
  ],
295
- inputs=[file_input, search_input]
296
  )
297
 
298
- demo.launch(debug=True, share=False, show_error=True)
28
  import contractions
29
  from sklearn.feature_extraction.text import TfidfVectorizer
30
 
 
31
  # Load environment variables
32
  load_dotenv()
33
 
34
+ # Download NLTK resources (Ensure this runs once or handle caching)
35
+ # nltk.download(['stopwords', 'wordnet', 'words'])
36
+ # nltk.download('punkt')
37
+ # nltk.download('punkt_tab')
38
+
39
  # Initialize Groq client
40
  groq_api_key = os.getenv("GROQ_API_KEY")
41
  groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
42
 
43
  # Stopwords customization
44
  stop_words = set(stopwords.words('english'))
45
+ stop_words.update({'ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'}) # Ensure stop_words is a set
46
 
47
  # --- Parsing & Preprocessing Functions ---
48
  def Parsing(parsed_text):
 
51
  file_path = parsed_text.name
52
  else:
53
  file_path = parsed_text
54
+ # Ensure textract handles encoding correctly or handle errors
55
+ raw_party = textract.process(file_path) # Removed encoding/method for broader compatibility
56
+ decoded_text = raw_party.decode('utf-8', errors='ignore') # Decode bytes to string, handling errors
57
+ return clean(decoded_text) # Pass decoded string to clean
58
  except Exception as e:
59
  print(f"Error parsing PDF: {e}")
60
  return f"Error parsing PDF: {e}"
 
85
  text = text[:10000]
86
  try:
87
  completion = groq_client.chat.completions.create(
88
+ model="llama3-8b-8192", # Or your preferred model
89
  messages=[
90
  {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
91
+ {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n{text}"}
92
  ],
93
  temperature=0.3,
94
  max_tokens=800
 
101
  word_tokens_party = word_tokenize(text2Party)
102
  fdistance = FreqDist(word_tokens_party).most_common(10)
103
  mem = {x[0]: x[1] for x in fdistance}
 
104
  vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
105
+ try:
106
+ tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
107
+ feature_names = vectorizer.get_feature_names_out()
108
+ tfidf_scores = {}
109
+ sentences = sent_tokenize(text2Party)
110
+ for i, word in enumerate(feature_names):
111
+ scores = []
112
+ for j in range(tfidf_matrix.shape[0]): # Iterate through sentences
113
+ if i < tfidf_matrix.shape[1]: # Check if word index is valid for this sentence vector
114
+ scores.append(tfidf_matrix[j, i])
115
+ if scores:
116
+ tfidf_scores[word] = sum(scores) / len(scores) # Average TF-IDF score across sentences
117
+ combined_scores = {}
118
+ all_words = set(list(mem.keys()) + list(tfidf_scores.keys()))
119
+ max_freq = max(mem.values()) if mem else 1
120
+ max_tfidf = max(tfidf_scores.values()) if tfidf_scores else 1
121
+ for word in all_words:
122
+ freq_score = mem.get(word, 0) / max_freq
123
+ tfidf_score = tfidf_scores.get(word, 0) / max_tfidf
124
+ combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
125
+ top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
126
+ return normalize(top_words)
127
+ except ValueError as ve: # Handle case where TF-IDF fails (e.g., empty after processing)
128
+ print(f"Warning: TF-IDF failed, using only frequency: {ve}")
129
+ # Fallback to just normalized frequency if TF-IDF fails
130
+ if mem:
131
+ max_freq = max(mem.values())
132
+ return {k: v / max_freq for k, v in list(mem.items())[:10]} # Return top 10 freq, normalized
133
+ else:
134
+ return {}
135
 
136
  def normalize(d, target=1.0):
137
  raw = sum(d.values())
 
144
  plt.clf()
145
  func(*args, **kwargs)
146
  buf = BytesIO()
147
+ plt.savefig(buf, format='png', bbox_inches='tight') # Add bbox_inches for better fit
148
  buf.seek(0)
149
+ img = Image.open(buf)
150
+ plt.close() # Use plt.close() instead of clf for better memory management after save
151
+ return img
152
  except Exception as e:
153
+ print(f"Plotting error in safe_plot: {e}")
154
+ traceback.print_exc() # Print traceback for debugging
155
+ return None # Return None on error
156
 
157
  def fDistancePlot(text2Party):
158
+ def plot_func():
159
+ tokens = word_tokenize(text2Party)
160
+ if not tokens:
161
+ plt.text(0.5, 0.5, "No data to plot", ha='center', va='center')
162
+ return
163
+ fdist = FreqDist(tokens)
164
+ fdist.plot(15, title='Frequency Distribution')
165
+ plt.xticks(rotation=45, ha='right') # Rotate x-axis labels if needed
166
+ plt.tight_layout()
167
+ return safe_plot(plot_func)
168
 
169
  def DispersionPlot(textParty):
170
  try:
171
  word_tokens_party = word_tokenize(textParty)
172
+ if not word_tokens_party:
173
+ return None
174
+ moby = Text(word_tokens_party)
175
  fdistance = FreqDist(word_tokens_party)
176
+ # Get top 5 words, handle potential IndexError if less than 5 unique words
177
+ common_words = fdistance.most_common(6)
178
+ if len(common_words) < 5:
179
+ word_Lst = [word for word, _ in common_words]
180
+ else:
181
+ word_Lst = [common_words[x][0] for x in range(5)]
182
+
183
+ if not word_Lst:
184
+ return None
185
+
186
+ plt.figure(figsize=(10, 5)) # Adjust figure size
187
  plt.title('Dispersion Plot')
188
  moby.dispersion_plot(word_Lst)
189
  plt.tight_layout()
190
  buf = BytesIO()
191
+ plt.savefig(buf, format='png', bbox_inches='tight')
192
  buf.seek(0)
193
  img = Image.open(buf)
194
+ plt.close() # Close the figure
195
  return img
196
  except Exception as e:
197
  print(f"Dispersion plot error: {e}")
198
+ traceback.print_exc()
199
  return None
200
 
201
  def word_cloud_generator(parsed_text_name, text_Party):
202
  try:
203
+ # Handle case where parsed_text_name might not have .name
204
+ filename_lower = ""
205
+ if hasattr(parsed_text_name, 'name') and parsed_text_name.name:
206
+ filename_lower = parsed_text_name.name.lower()
207
+ elif isinstance(parsed_text_name, str):
208
+ filename_lower = parsed_text_name.lower()
209
+
210
+ mask_path = None
211
+ if 'bjp' in filename_lower:
212
  mask_path = 'bjpImg2.jpeg'
213
+ elif 'congress' in filename_lower:
214
  mask_path = 'congress3.jpeg'
215
+ elif 'aap' in filename_lower:
216
  mask_path = 'aapMain2.jpg'
217
+
218
+ # Generate word cloud
219
+ if text_Party.strip() == "":
220
+ raise ValueError("Text for word cloud is empty")
221
 
222
  if mask_path and os.path.exists(mask_path):
223
  orgImg = Image.open(mask_path)
224
+ # Ensure mask is in the right format (e.g., uint8)
225
+ if orgImg.mode != 'RGB':
226
+ orgImg = orgImg.convert('RGB')
227
  mask = np.array(orgImg)
228
+ wordcloud = WordCloud(max_words=3000, mask=mask, background_color='white').generate(text_Party) # Added background color
229
  else:
230
+ wordcloud = WordCloud(max_words=2000, background_color='white').generate(text_Party)
231
+
232
+ plt.figure(figsize=(8, 6)) # Set figure size
233
+ plt.imshow(wordcloud, interpolation='bilinear') # Use bilinear interpolation
234
  plt.axis("off")
235
+ plt.tight_layout()
236
  buf = BytesIO()
237
+ plt.savefig(buf, format='png', bbox_inches='tight')
238
  buf.seek(0)
239
+ img = Image.open(buf)
240
+ plt.close() # Close the figure
241
+ return img
242
  except Exception as e:
243
  print(f"Word cloud error: {e}")
244
+ traceback.print_exc()
245
+ return None # Return None on error
246
 
247
  def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
248
  """
 
250
  """
251
  if not target_word or target_word.strip() == "":
252
  return "Please enter a search term"
 
253
  tokens = nltk.word_tokenize(tar_passage)
254
  text = nltk.Text(tokens)
255
  c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
256
  offsets = c.offsets(target_word)
257
+ if not offsets:
258
+ return f"Word '{target_word}' not found."
259
  concordance_txt = [
260
  text.tokens[max(0, offset - left_margin):offset + right_margin]
261
  for offset in offsets[:numLins]
262
  ]
 
263
  result = [' '.join(con_sub) for con_sub in concordance_txt]
264
+ return '\n'.join(result) # Use newline for better readability in textbox
265
 
266
  # --- Main Analysis Function ---
267
  def analysis(Manifesto, Search):
 
270
  return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
271
  if Search.strip() == "":
272
  Search = "government"
 
273
  raw_party = Parsing(Manifesto)
274
  if isinstance(raw_party, str) and raw_party.startswith("Error"):
275
  return raw_party, {}, None, None, None, None, None, "Parsing failed"
276
 
277
  text_Party = clean_text(raw_party)
278
  text_Party_processed = Preprocess(text_Party)
 
279
 
280
+ summary = generate_summary(raw_party) # Use raw_party for summary for more context?
281
 
282
+ # --- Sentiment Analysis ---
283
+ if not text_Party_processed.strip():
284
+ # Handle empty text after processing
285
+ df_dummy = pd.DataFrame({'Polarity_Label': ['Neutral'], 'Subjectivity_Label': ['Low']})
286
+ polarity_val = 0.0
287
+ subjectivity_val = 0.0
288
+ else:
289
+ polarity_val = TextBlob(text_Party_processed).sentiment.polarity
290
+ subjectivity_val = TextBlob(text_Party_processed).sentiment.subjectivity
291
+ polarity_label = 'Positive' if polarity_val > 0 else 'Negative' if polarity_val < 0 else 'Neutral'
292
+ subjectivity_label = 'High' if subjectivity_val > 0.5 else 'Low'
293
+ df_dummy = pd.DataFrame({'Polarity_Label': [polarity_label], 'Subjectivity_Label': [subjectivity_label]})
294
+
295
+ # --- Generate Plots with Safe Plotting ---
296
+ # Pass the potentially empty text and handle inside plotting functions
297
+ sentiment_plot = safe_plot(lambda: df_dummy['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
298
+ subjectivity_plot = safe_plot(lambda: df_dummy['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
299
  freq_plot = fDistancePlot(text_Party_processed)
300
  dispersion_plot = DispersionPlot(text_Party_processed)
301
+ wordcloud = word_cloud_generator(Manifesto, text_Party_processed) # Pass Manifesto object itself
302
 
303
  fdist_Party = fDistance(text_Party_processed)
304
  searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
 
306
  return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
307
 
308
  except Exception as e:
309
+ error_msg = f"Critical error in analysis function: {str(e)}"
310
  print(error_msg)
311
  traceback.print_exc()
312
+ # Return error messages/images in the correct order
313
  return error_msg, {}, None, None, None, None, None, "Analysis failed"
314
 
315
 
316
+ # --- Gradio Interface ---
317
+ # Use Blocks for custom layout
318
  with gr.Blocks(title='Manifesto Analysis') as demo:
319
  gr.Markdown("# Manifesto Analysis")
320
+
321
+ # Input Section
322
  with gr.Row():
323
+ with gr.Column(scale=1): # Adjust scale if needed
324
+ file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
325
+ with gr.Column(scale=1):
326
+ search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
327
+ submit_btn = gr.Button("Analyze Manifesto", variant='primary') # Make button prominent
328
+
329
+ # Output Section using Tabs
330
  with gr.Tabs():
331
+ # --- Summary Tab ---
332
+ with gr.TabItem("Summary"):
333
+ summary_output = gr.Textbox(label='AI-Generated Summary', lines=10, interactive=False)
334
+
335
+ # --- Search Results Tab ---
336
+ with gr.TabItem("Search Results"):
337
+ search_output = gr.Textbox(label='Context Based Search Results', lines=10, interactive=False)
338
+
339
+ # --- Key Topics Tab ---
340
+ with gr.TabItem("Key Topics"):
341
+ topics_output = gr.Label(label="Most Relevant Topics (LLM Enhanced)", num_top_classes=10) # Show top 10
342
+
343
+ # --- Visualizations Tab ---
344
  with gr.TabItem("Visualizations"):
345
+ # Use Rows and Columns for better arrangement
346
+ with gr.Row(): # Row 1: Sentiment & Subjectivity
347
+ with gr.Column():
348
+ sentiment_output = gr.Image(label='Sentiment Analysis', interactive=False, height=400) # Set height
349
+ with gr.Column():
350
+ subjectivity_output = gr.Image(label='Subjectivity Analysis', interactive=False, height=400)
351
+
352
+ with gr.Row(): # Row 2: Word Cloud & Frequency
353
+ with gr.Column():
354
+ wordcloud_output = gr.Image(label='Word Cloud', interactive=False, height=400)
355
+ with gr.Column():
356
+ freq_output = gr.Image(label='Frequency Distribution', interactive=False, height=400)
357
 
358
+ with gr.Row(): # Row 3: Dispersion Plot (Full width)
359
+ with gr.Column():
360
+ dispersion_output = gr.Image(label='Dispersion Plot', interactive=False, height=400) # Adjust height as needed
361
+
362
+ # --- Link Button Click to Function and Outputs ---
363
+ # Ensure the order of outputs matches the function return order
364
  submit_btn.click(
365
  fn=analysis,
366
  inputs=[file_input, search_input],
367
  outputs=[
368
+ search_output, # 1
369
+ topics_output, # 2
370
+ sentiment_output, # 3
371
+ subjectivity_output, # 4
372
+ wordcloud_output, # 5
373
+ freq_output, # 6
374
+ dispersion_output, # 7
375
+ summary_output # 8
376
+ ],
377
+ concurrency_limit=1 # Limit concurrent analyses if needed
378
  )
379
 
380
+ # --- Examples ---
381
  gr.Examples(
382
  examples=[
383
  ["Example/AAP_Manifesto_2019.pdf", "government"],
384
  ["Example/Bjp_Manifesto_2019.pdf", "environment"],
385
  ["Example/Congress_Manifesto_2019.pdf", "safety"]
386
  ],
387
+ inputs=[file_input, search_input],
388
+ outputs=[search_output, topics_output, sentiment_output, subjectivity_output, wordcloud_output, freq_output, dispersion_output, summary_output], # Link examples to outputs
389
+ fn=analysis # Run analysis on example click
390
  )
391
 
392
+ # Launch the app
393
+ if __name__ == "__main__":
394
+ demo.launch(debug=True, share=False, show_error=True)
395
+
396
+ # import random
397
+ # import matplotlib.pyplot as plt
398
+ # import nltk
399
+ # from nltk.tokenize import word_tokenize, sent_tokenize
400
+ # from nltk.corpus import stopwords
401
+ # from nltk.stem import WordNetLemmatizer
402
+ # from nltk.text import Text
403
+ # from nltk.probability import FreqDist
404
+ # from cleantext import clean
405
+ # import textract
406
+ # import urllib.request
407
+ # from io import BytesIO
408
+ # import sys
409
+ # import pandas as pd
410
+ # import cv2
411
+ # import re
412
+ # from wordcloud import WordCloud, ImageColorGenerator
413
+ # from textblob import TextBlob
414
+ # from PIL import Image
415
+ # import os
416
+ # import gradio as gr
417
+ # from dotenv import load_dotenv
418
+ # import groq
419
+ # import json
420
+ # import traceback
421
+ # import numpy as np
422
+ # import unidecode
423
+ # import contractions
424
+ # from sklearn.feature_extraction.text import TfidfVectorizer
425
+
426
+
427
+ # # Load environment variables
428
+ # load_dotenv()
429
+
430
+ # # Download NLTK resources
431
+ # nltk.download(['stopwords', 'wordnet', 'words'])
432
+ # nltk.download('punkt')
433
+ # nltk.download('punkt_tab')
434
+ # # Initialize Groq client
435
+ # groq_api_key = os.getenv("GROQ_API_KEY")
436
+ # groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
437
+
438
+ # # Stopwords customization
439
+ # stop_words = set(stopwords.words('english'))
440
+ # stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
441
+
442
+ # # --- Parsing & Preprocessing Functions ---
443
+ # def Parsing(parsed_text):
444
+ # try:
445
+ # if hasattr(parsed_text, 'name'):
446
+ # file_path = parsed_text.name
447
+ # else:
448
+ # file_path = parsed_text
449
+ # raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
450
+ # return clean(raw_party)
451
+ # except Exception as e:
452
+ # print(f"Error parsing PDF: {e}")
453
+ # return f"Error parsing PDF: {e}"
454
+
455
+ # def clean_text(text):
456
+ # text = text.encode("ascii", errors="ignore").decode("ascii")
457
+ # text = unidecode.unidecode(text)
458
+ # text = contractions.fix(text)
459
+ # text = re.sub(r"\n", " ", text)
460
+ # text = re.sub(r"\t", " ", text)
461
+ # text = re.sub(r"/ ", " ", text)
462
+ # text = text.strip()
463
+ # text = re.sub(" +", " ", text).strip()
464
+ # text = [word for word in text.split() if word not in stop_words]
465
+ # return ' '.join(text)
466
+
467
+ # def Preprocess(textParty):
468
+ # text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
469
+ # pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
470
+ # text2Party = pattern.sub('', text1Party)
471
+ # return text2Party
472
+
473
+ # # --- Core Analysis Functions ---
474
+ # def generate_summary(text):
475
+ # if not groq_client:
476
+ # return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
477
+ # if len(text) > 10000:
478
+ # text = text[:10000]
479
+ # try:
480
+ # completion = groq_client.chat.completions.create(
481
+ # model="llama3-8b-8192",
482
+ # messages=[
483
+ # {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
484
+ # {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
485
+ # ],
486
+ # temperature=0.3,
487
+ # max_tokens=800
488
+ # )
489
+ # return completion.choices[0].message.content
490
+ # except Exception as e:
491
+ # return f"Error generating summary: {str(e)}"
492
+
493
+ # def fDistance(text2Party):
494
+ # word_tokens_party = word_tokenize(text2Party)
495
+ # fdistance = FreqDist(word_tokens_party).most_common(10)
496
+ # mem = {x[0]: x[1] for x in fdistance}
497
+
498
+ # vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
499
+ # tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
500
+ # feature_names = vectorizer.get_feature_names_out()
501
+
502
+ # tfidf_scores = {}
503
+ # for i, word in enumerate(feature_names):
504
+ # scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
505
+ # if scores:
506
+ # tfidf_scores[word] = sum(scores) / len(scores)
507
+
508
+ # combined_scores = {}
509
+ # for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
510
+ # freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
511
+ # tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
512
+ # combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
513
+
514
+ # top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
515
+ # return normalize(top_words)
516
+
517
+ # def normalize(d, target=1.0):
518
+ # raw = sum(d.values())
519
+ # factor = target / raw if raw != 0 else 0
520
+ # return {key: value * factor for key, value in d.items()}
521
+
522
+ # # --- Visualization Functions with Error Handling ---
523
+ # def safe_plot(func, *args, **kwargs):
524
+ # try:
525
+ # plt.clf()
526
+ # func(*args, **kwargs)
527
+ # buf = BytesIO()
528
+ # plt.savefig(buf, format='png')
529
+ # buf.seek(0)
530
+ # return Image.open(buf)
531
+ # except Exception as e:
532
+ # print(f"Plotting error: {e}")
533
+ # return None
534
+
535
+ # def fDistancePlot(text2Party):
536
+ # return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
537
+
538
+ # def DispersionPlot(textParty):
539
+ # try:
540
+ # word_tokens_party = word_tokenize(textParty)
541
+ # moby = Text(word_tokens_party) # Ensure Text is imported
542
+ # fdistance = FreqDist(word_tokens_party)
543
+ # word_Lst = [fdistance.most_common(6)[x][0] for x in range(5)]
544
+ # plt.figure(figsize=(4, 3))
545
+ # plt.title('Dispersion Plot')
546
+ # moby.dispersion_plot(word_Lst)
547
+ # plt.tight_layout()
548
+ # buf = BytesIO()
549
+ # plt.savefig(buf, format='png')
550
+ # buf.seek(0)
551
+ # img = Image.open(buf)
552
+ # plt.clf()
553
+ # return img
554
+ # except Exception as e:
555
+ # print(f"Dispersion plot error: {e}")
556
+ # return None
557
+
558
+ # def word_cloud_generator(parsed_text_name, text_Party):
559
+ # try:
560
+ # parsed = parsed_text_name.lower()
561
+ # if 'bjp' in parsed:
562
+ # mask_path = 'bjpImg2.jpeg'
563
+ # elif 'congress' in parsed:
564
+ # mask_path = 'congress3.jpeg'
565
+ # elif 'aap' in parsed:
566
+ # mask_path = 'aapMain2.jpg'
567
+ # else:
568
+ # mask_path = None
569
+
570
+ # if mask_path and os.path.exists(mask_path):
571
+ # orgImg = Image.open(mask_path)
572
+ # mask = np.array(orgImg)
573
+ # wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
574
+ # plt.imshow(wordcloud)
575
+ # else:
576
+ # wordcloud = WordCloud(max_words=2000).generate(text_Party)
577
+ # plt.imshow(wordcloud)
578
+ # plt.axis("off")
579
+ # buf = BytesIO()
580
+ # plt.savefig(buf, format='png')
581
+ # buf.seek(0)
582
+ # return Image.open(buf)
583
+ # except Exception as e:
584
+ # print(f"Word cloud error: {e}")
585
+ # return None
586
+
587
+ # def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
588
+ # """
589
+ # Function to get all the phrases that contain the target word in a text/passage.
590
+ # """
591
+ # if not target_word or target_word.strip() == "":
592
+ # return "Please enter a search term"
593
+
594
+ # tokens = nltk.word_tokenize(tar_passage)
595
+ # text = nltk.Text(tokens)
596
+ # c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
597
+ # offsets = c.offsets(target_word)
598
+
599
+ # concordance_txt = [
600
+ # text.tokens[max(0, offset - left_margin):offset + right_margin]
601
+ # for offset in offsets[:numLins]
602
+ # ]
603
+
604
+ # result = [' '.join(con_sub) for con_sub in concordance_txt]
605
+ # return '\n'.join(result)
606
+
607
+ # # --- Main Analysis Function ---
608
+ # def analysis(Manifesto, Search):
609
+ # try:
610
+ # if Manifesto is None:
611
+ # return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
612
+ # if Search.strip() == "":
613
+ # Search = "government"
614
+
615
+ # raw_party = Parsing(Manifesto)
616
+ # if isinstance(raw_party, str) and raw_party.startswith("Error"):
617
+ # return raw_party, {}, None, None, None, None, None, "Parsing failed"
618
+
619
+ # text_Party = clean_text(raw_party)
620
+ # text_Party_processed = Preprocess(text_Party)
621
+ # summary = generate_summary(raw_party)
622
+
623
+ # df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
624
+ # df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
625
+ # df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
626
+ # df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
627
+ # df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
628
+
629
+ # # Generate Plots with Safe Plotting
630
+ # sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
631
+ # subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
632
+ # freq_plot = fDistancePlot(text_Party_processed)
633
+ # dispersion_plot = DispersionPlot(text_Party_processed)
634
+ # wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
635
+
636
+ # fdist_Party = fDistance(text_Party_processed)
637
+ # searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
638
+
639
+ # return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
640
+
641
+ # except Exception as e:
642
+ # error_msg = f"Critical error: {str(e)}"
643
+ # print(error_msg)
644
+ # traceback.print_exc()
645
+ # return error_msg, {}, None, None, None, None, None, "Analysis failed"
646
+
647
+ # # --- Gradio Interface ---
648
+ # Search_txt = "text"
649
+ # filePdf = "file"
650
+
651
+ # with gr.Blocks(title='Manifesto Analysis') as demo:
652
+ # gr.Markdown("# Manifesto Analysis")
653
+ # with gr.Row():
654
+ # with gr.Column():
655
+ # file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
656
+ # search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
657
+ # submit_btn = gr.Button("Analyze Manifesto")
658
+ # with gr.Tabs():
659
+ # with gr.TabItem("Summary"): gr.Textbox(label='LLM Based Summary', lines=10)
660
+ # with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
661
+ # with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
662
+ # with gr.TabItem("Visualizations"):
663
+ # with gr.Row():
664
+ # gr.Image(label='Sentiment Analysis'), gr.Image(label='Subjectivity Analysis')
665
+ # with gr.Row():
666
+ # gr.Image(label='Word Cloud'), gr.Image(label='Frequency Distribution')
667
+ # gr.Image(label='Dispersion Plot')
668
+
669
+ # submit_btn.click(
670
+ # fn=analysis,
671
+ # inputs=[file_input, search_input],
672
+ # outputs=[
673
+ # gr.Textbox(label='Context Based Search'),
674
+ # gr.Label(label="Most Relevant Topics (LLM Enhanced)"),
675
+ # gr.Image(label='Sentiment Analysis'),
676
+ # gr.Image(label='Subjectivity Analysis'),
677
+ # gr.Image(label='Word Cloud'),
678
+ # gr.Image(label='Frequency Distribution'),
679
+ # gr.Image(label='Dispersion Plot'),
680
+ # gr.Textbox(label='AI-Generated Summary', lines=10)
681
+ # ]
682
+ # )
683
+
684
+ # gr.Examples(
685
+ # examples=[
686
+ # ["Example/AAP_Manifesto_2019.pdf", "government"],
687
+ # ["Example/Bjp_Manifesto_2019.pdf", "environment"],
688
+ # ["Example/Congress_Manifesto_2019.pdf", "safety"]
689
+ # ],
690
+ # inputs=[file_input, search_input]
691
+ # )
692
+
693
+ # demo.launch(debug=True, share=False, show_error=True)