Update app.py
app.py CHANGED
@@ -28,13 +28,37 @@ import unidecode
 import contractions
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-
+
 load_dotenv()
+import nltk
+import ssl
+
+def ensure_nltk_resources():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        print("NLTK resources not found. Downloading...")
+        try:
+            # Handling potential SSL issues (common on some systems)
+            _create_unverified_https_context = ssl._create_unverified_context
+        except AttributeError:
+            pass
+        else:
+            ssl._create_default_https_context = _create_unverified_https_context
+
+        nltk.download(['stopwords', 'wordnet', 'words'])
+        nltk.download('punkt')
+        nltk.download('punkt_tab')
+        print("NLTK resources downloaded successfully.")
+
+
+ensure_nltk_resources()
 
 # Download NLTK resources (Ensure this runs once or handle caching)
-nltk.download(['stopwords', 'wordnet', 'words'])
-nltk.download('punkt')
-nltk.download('punkt_tab')
+# nltk.download(['stopwords', 'wordnet', 'words'])
+# nltk.download('punkt')
+# nltk.download('punkt_tab')
 # Initialize Groq client
 groq_api_key = os.getenv("GROQ_API_KEY")
 groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
@@ -243,6 +267,7 @@ def word_cloud_generator(parsed_text_name, text_Party):
         traceback.print_exc()
         return None  # Return None on error
 
+# Initial design for concordance based search
 def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
     """
     Function to get all the phrases that contain the target word in a text/passage.
@@ -262,20 +287,76 @@ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
     result = [' '.join(con_sub) for con_sub in concordance_txt]
     return '\n'.join(result)  # Use newline for better readability in textbox
 
-
+
+def get_contextual_search_result(target_word, tar_passage, groq_client_instance, max_context_length=8000):
+    """
+    Uses the LLM to provide contextual information about the target word within the passage.
+    """
+    if not target_word or target_word.strip() == "":
+        return "Please enter a search term."
+
+    if not groq_client_instance:
+        return "Contextual search requires the LLM API. Please set up your GROQ_API_KEY."
+
+    # Basic check if word exists (optional, LLM can handle it too)
+    if target_word.lower() not in tar_passage.lower():
+        return f"The term '{target_word}' was not found in the manifesto text."
+
+    # Truncate passage if too long for the model/context window
+    # You might need to adjust this based on your model's limits and desired performance
+    if len(tar_passage) > max_context_length:
+        # Simple truncation; could be improved to ensure sentences are complete
+        tar_passage = tar_passage[:max_context_length]
+        print(f"Warning: Passage truncated for LLM search context to {max_context_length} characters.")
+
+    prompt = f"""
+    You are given a political manifesto text and a specific search term.
+    Your task is to find all relevant mentions of the search term in the text and provide a concise, informative summary of the context surrounding each mention.
+    Focus on the key ideas, policies, or points related to the search term.
+    If the term is not found or not relevant, state that clearly.
+    Search Term: {target_word}
+    Manifesto Text:
+    {tar_passage}
+    """
+
+    try:
+        completion = groq_client_instance.chat.completions.create(
+            model="llama3-8b-8192",  # Use the same or a suitable model
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant skilled at analyzing political texts and extracting relevant information based on a search query."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.2,  # Low temperature for more factual extraction
+            max_tokens=1000  # Adjust based on expected output length
+        )
+        result = completion.choices[0].message.content.strip()
+        return result if result else f"No specific context for '{target_word}' could be generated."
+    except Exception as e:
+        error_msg = f"Error during contextual search for '{target_word}': {str(e)}"
+        print(error_msg)
+        traceback.print_exc()
+        # Fallback to concordance if LLM fails?
+        # return get_all_phases_containing_tar_wrd_fallback(target_word, tar_passage)
+        return error_msg  # Or return the error message directly
+
 def analysis(Manifesto, Search):
     try:
         if Manifesto is None:
+            # Ensure return order matches the outputs list
             return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
         if Search.strip() == "":
             Search = "government"
         raw_party = Parsing(Manifesto)
         if isinstance(raw_party, str) and raw_party.startswith("Error"):
             return raw_party, {}, None, None, None, None, None, "Parsing failed"
-
         text_Party = clean_text(raw_party)
         text_Party_processed = Preprocess(text_Party)
+
+        # --- Perform Search FIRST using the ORIGINAL text for better context ---
+        # Pass the original raw text for richer context to the LLM
+        searChRes = get_contextual_search_result(Search, raw_party, groq_client)
 
+        # --- Then proceed with other analyses ---
         summary = generate_summary(raw_party)  # Use raw_party for summary for more context?
 
         # --- Sentiment Analysis ---
@@ -298,10 +379,10 @@ def analysis(Manifesto, Search):
         freq_plot = fDistancePlot(text_Party_processed)
         dispersion_plot = DispersionPlot(text_Party_processed)
         wordcloud = word_cloud_generator(Manifesto, text_Party_processed)  # Pass Manifesto object itself
-
         fdist_Party = fDistance(text_Party_processed)
-
-
+
+        # searChRes is now generated earlier
+
         return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
 
     except Exception as e:
@@ -312,11 +393,10 @@ def analysis(Manifesto, Search):
         return error_msg, {}, None, None, None, None, None, "Analysis failed"
 
 
-# --- Gradio Interface ---
+# --- Gradio Interface (remains largely the same, just ensuring output variable names match) ---
 # Use Blocks for custom layout
 with gr.Blocks(title='Manifesto Analysis') as demo:
     gr.Markdown("# Manifesto Analysis")
-
     # Input Section
     with gr.Row():
         with gr.Column(scale=1):  # Adjust scale if needed
@@ -333,7 +413,8 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
 
         # --- Search Results Tab ---
         with gr.TabItem("Search Results"):
-
+            # Use the specific output variable defined in the layout
+            search_output = gr.Textbox(label='Context Based Search Results', lines=15, interactive=False, max_lines=20)  # Increased lines/max_lines
 
         # --- Key Topics Tab ---
         with gr.TabItem("Key Topics"):
@@ -364,7 +445,7 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
         fn=analysis,
         inputs=[file_input, search_input],
         outputs=[
-            search_output,         # 1
+            search_output,         # 1 (Now contextual)
             topics_output,         # 2
             sentiment_output,      # 3
             subjectivity_output,   # 4
@@ -392,301 +473,3 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
 if __name__ == "__main__":
     demo.launch(debug=True, share=False, show_error=True)
 
-# import random
-# import matplotlib.pyplot as plt
-# import nltk
-# from nltk.tokenize import word_tokenize, sent_tokenize
-# from nltk.corpus import stopwords
-# from nltk.stem import WordNetLemmatizer
-# from nltk.text import Text
-# from nltk.probability import FreqDist
-# from cleantext import clean
-# import textract
-# import urllib.request
-# from io import BytesIO
-# import sys
-# import pandas as pd
-# import cv2
-# import re
-# from wordcloud import WordCloud, ImageColorGenerator
-# from textblob import TextBlob
-# from PIL import Image
-# import os
-# import gradio as gr
-# from dotenv import load_dotenv
-# import groq
-# import json
-# import traceback
-# import numpy as np
-# import unidecode
-# import contractions
-# from sklearn.feature_extraction.text import TfidfVectorizer
-
-
-# # Load environment variables
-# load_dotenv()
-
-# # Download NLTK resources
-# nltk.download(['stopwords', 'wordnet', 'words'])
-# nltk.download('punkt')
-# nltk.download('punkt_tab')
-# # Initialize Groq client
-# groq_api_key = os.getenv("GROQ_API_KEY")
-# groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
-
-# # Stopwords customization
-# stop_words = set(stopwords.words('english'))
-# stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
-
-# # --- Parsing & Preprocessing Functions ---
-# def Parsing(parsed_text):
-#     try:
-#         if hasattr(parsed_text, 'name'):
-#             file_path = parsed_text.name
-#         else:
-#             file_path = parsed_text
-#         raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
-#         return clean(raw_party)
-#     except Exception as e:
-#         print(f"Error parsing PDF: {e}")
-#         return f"Error parsing PDF: {e}"
-
-# def clean_text(text):
-#     text = text.encode("ascii", errors="ignore").decode("ascii")
-#     text = unidecode.unidecode(text)
-#     text = contractions.fix(text)
-#     text = re.sub(r"\n", " ", text)
-#     text = re.sub(r"\t", " ", text)
-#     text = re.sub(r"/ ", " ", text)
-#     text = text.strip()
-#     text = re.sub(" +", " ", text).strip()
-#     text = [word for word in text.split() if word not in stop_words]
-#     return ' '.join(text)
-
-# def Preprocess(textParty):
-#     text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
-#     pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
-#     text2Party = pattern.sub('', text1Party)
-#     return text2Party
-
-# # --- Core Analysis Functions ---
-# def generate_summary(text):
-#     if not groq_client:
-#         return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
-#     if len(text) > 10000:
-#         text = text[:10000]
-#     try:
-#         completion = groq_client.chat.completions.create(
-#             model="llama3-8b-8192",
-#             messages=[
-#                 {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
-#                 {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
-#             ],
-#             temperature=0.3,
-#             max_tokens=800
-#         )
-#         return completion.choices[0].message.content
-#     except Exception as e:
-#         return f"Error generating summary: {str(e)}"
-
-# def fDistance(text2Party):
-#     word_tokens_party = word_tokenize(text2Party)
-#     fdistance = FreqDist(word_tokens_party).most_common(10)
-#     mem = {x[0]: x[1] for x in fdistance}
-
-#     vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
-#     tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
-#     feature_names = vectorizer.get_feature_names_out()
-
-#     tfidf_scores = {}
-#     for i, word in enumerate(feature_names):
-#         scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
-#         if scores:
-#             tfidf_scores[word] = sum(scores) / len(scores)
-
-#     combined_scores = {}
-#     for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
-#         freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
-#         tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
-#         combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
-
-#     top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
-#     return normalize(top_words)
-
-# def normalize(d, target=1.0):
-#     raw = sum(d.values())
-#     factor = target / raw if raw != 0 else 0
-#     return {key: value * factor for key, value in d.items()}
-
-# # --- Visualization Functions with Error Handling ---
-# def safe_plot(func, *args, **kwargs):
-#     try:
-#         plt.clf()
-#         func(*args, **kwargs)
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         return Image.open(buf)
-#     except Exception as e:
-#         print(f"Plotting error: {e}")
-#         return None
-
-# def fDistancePlot(text2Party):
-#     return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
-
-# def DispersionPlot(textParty):
-#     try:
-#         word_tokens_party = word_tokenize(textParty)
-#         moby = Text(word_tokens_party)  # Ensure Text is imported
-#         fdistance = FreqDist(word_tokens_party)
-#         word_Lst = [fdistance.most_common(6)[x][0] for x in range(5)]
-#         plt.figure(figsize=(4, 3))
-#         plt.title('Dispersion Plot')
-#         moby.dispersion_plot(word_Lst)
-#         plt.tight_layout()
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         img = Image.open(buf)
-#         plt.clf()
-#         return img
-#     except Exception as e:
-#         print(f"Dispersion plot error: {e}")
-#         return None
-
-# def word_cloud_generator(parsed_text_name, text_Party):
-#     try:
-#         parsed = parsed_text_name.lower()
-#         if 'bjp' in parsed:
-#             mask_path = 'bjpImg2.jpeg'
-#         elif 'congress' in parsed:
-#             mask_path = 'congress3.jpeg'
-#         elif 'aap' in parsed:
-#             mask_path = 'aapMain2.jpg'
-#         else:
-#             mask_path = None
-
-#         if mask_path and os.path.exists(mask_path):
-#             orgImg = Image.open(mask_path)
-#             mask = np.array(orgImg)
-#             wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
-#             plt.imshow(wordcloud)
-#         else:
-#             wordcloud = WordCloud(max_words=2000).generate(text_Party)
-#             plt.imshow(wordcloud)
-#         plt.axis("off")
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         return Image.open(buf)
-#     except Exception as e:
-#         print(f"Word cloud error: {e}")
-#         return None
-
-# def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
-#     """
-#     Function to get all the phrases that contain the target word in a text/passage.
-#     """
-#     if not target_word or target_word.strip() == "":
-#         return "Please enter a search term"
-
-#     tokens = nltk.word_tokenize(tar_passage)
-#     text = nltk.Text(tokens)
-#     c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
-#     offsets = c.offsets(target_word)
-
-#     concordance_txt = [
-#         text.tokens[max(0, offset - left_margin):offset + right_margin]
-#         for offset in offsets[:numLins]
-#     ]
-
-#     result = [' '.join(con_sub) for con_sub in concordance_txt]
-#     return '\n'.join(result)
-
-# # --- Main Analysis Function ---
-# def analysis(Manifesto, Search):
-#     try:
-#         if Manifesto is None:
-#             return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
-#         if Search.strip() == "":
-#             Search = "government"
-
-#         raw_party = Parsing(Manifesto)
-#         if isinstance(raw_party, str) and raw_party.startswith("Error"):
-#             return raw_party, {}, None, None, None, None, None, "Parsing failed"
-
-#         text_Party = clean_text(raw_party)
-#         text_Party_processed = Preprocess(text_Party)
-#         summary = generate_summary(raw_party)
-
-#         df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
-#         df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
-#         df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
-#         df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
-#         df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
-
-#         # Generate Plots with Safe Plotting
-#         sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
-#         subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
-#         freq_plot = fDistancePlot(text_Party_processed)
-#         dispersion_plot = DispersionPlot(text_Party_processed)
-#         wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
-
-#         fdist_Party = fDistance(text_Party_processed)
-#         searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
-
-#         return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
-
-#     except Exception as e:
-#         error_msg = f"Critical error: {str(e)}"
-#         print(error_msg)
-#         traceback.print_exc()
-#         return error_msg, {}, None, None, None, None, None, "Analysis failed"
-
-# # --- Gradio Interface ---
-# Search_txt = "text"
-# filePdf = "file"
-
-# with gr.Blocks(title='Manifesto Analysis') as demo:
-#     gr.Markdown("# Manifesto Analysis")
-#     with gr.Row():
-#         with gr.Column():
-#             file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
-#             search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
-#             submit_btn = gr.Button("Analyze Manifesto")
-#     with gr.Tabs():
-#         with gr.TabItem("Summary"): gr.Textbox(label='LLM Based Summary', lines=10)
-#         with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
-#         with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
-#         with gr.TabItem("Visualizations"):
-#             with gr.Row():
-#                 gr.Image(label='Sentiment Analysis'), gr.Image(label='Subjectivity Analysis')
-#             with gr.Row():
-#                 gr.Image(label='Word Cloud'), gr.Image(label='Frequency Distribution')
-#             gr.Image(label='Dispersion Plot')
-
-#     submit_btn.click(
-#         fn=analysis,
-#         inputs=[file_input, search_input],
-#         outputs=[
-#             gr.Textbox(label='Context Based Search'),
-#             gr.Label(label="Most Relevant Topics (LLM Enhanced)"),
-#             gr.Image(label='Sentiment Analysis'),
-#             gr.Image(label='Subjectivity Analysis'),
-#             gr.Image(label='Word Cloud'),
-#             gr.Image(label='Frequency Distribution'),
-#             gr.Image(label='Dispersion Plot'),
-#             gr.Textbox(label='AI-Generated Summary', lines=10)
-#         ]
-#     )
-
-#     gr.Examples(
-#         examples=[
-#             ["Example/AAP_Manifesto_2019.pdf", "government"],
-#             ["Example/Bjp_Manifesto_2019.pdf", "environment"],
-#             ["Example/Congress_Manifesto_2019.pdf", "safety"]
-#         ],
-#         inputs=[file_input, search_input]
-#     )
-
-#     demo.launch(debug=True, share=False, show_error=True)