Update app.py
app.py CHANGED
@@ -28,13 +28,37 @@ import unidecode
 import contractions
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-
+
 load_dotenv()
+import nltk
+import ssl
+
+def ensure_nltk_resources():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        print("NLTK resources not found. Downloading...")
+        try:
+            # Handling potential SSL issues (common on some systems)
+            _create_unverified_https_context = ssl._create_unverified_context
+        except AttributeError:
+            pass
+        else:
+            ssl._create_default_https_context = _create_unverified_https_context
+
+        nltk.download(['stopwords', 'wordnet', 'words'])
+        nltk.download('punkt')
+        nltk.download('punkt_tab')
+        print("NLTK resources downloaded successfully.")
+
+
+ensure_nltk_resources()
 
 # Download NLTK resources (Ensure this runs once or handle caching)
-nltk.download(['stopwords', 'wordnet', 'words'])
-nltk.download('punkt')
-nltk.download('punkt_tab')
+# nltk.download(['stopwords', 'wordnet', 'words'])
+# nltk.download('punkt')
+# nltk.download('punkt_tab')
 # Initialize Groq client
 groq_api_key = os.getenv("GROQ_API_KEY")
 groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
@@ -243,6 +267,7 @@ def word_cloud_generator(parsed_text_name, text_Party):
         traceback.print_exc()
         return None  # Return None on error
 
+# Initial design for concordance based search
 def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
     """
     Function to get all the phrases that contain the target word in a text/passage.
@@ -262,20 +287,76 @@ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
     result = [' '.join(con_sub) for con_sub in concordance_txt]
     return '\n'.join(result)  # Use newline for better readability in textbox
 
-
+
+def get_contextual_search_result(target_word, tar_passage, groq_client_instance, max_context_length=8000):
+    """
+    Uses the LLM to provide contextual information about the target word within the passage.
+    """
+    if not target_word or target_word.strip() == "":
+        return "Please enter a search term."
+
+    if not groq_client_instance:
+        return "Contextual search requires the LLM API. Please set up your GROQ_API_KEY."
+
+    # Basic check if word exists (optional, LLM can handle it too)
+    if target_word.lower() not in tar_passage.lower():
+        return f"The term '{target_word}' was not found in the manifesto text."
+
+    # Truncate passage if too long for the model/context window
+    # You might need to adjust this based on your model's limits and desired performance
+    if len(tar_passage) > max_context_length:
+        # Simple truncation; could be improved to ensure sentences are complete
+        tar_passage = tar_passage[:max_context_length]
+        print(f"Warning: Passage truncated for LLM search context to {max_context_length} characters.")
+
+    prompt = f"""
+    You are given a political manifesto text and a specific search term.
+    Your task is to find all relevant mentions of the search term in the text and provide a concise, informative summary of the context surrounding each mention.
+    Focus on the key ideas, policies, or points related to the search term.
+    If the term is not found or not relevant, state that clearly.
+    Search Term: {target_word}
+    Manifesto Text:
+    {tar_passage}
+    """
+
+    try:
+        completion = groq_client_instance.chat.completions.create(
+            model="llama3-8b-8192",  # Use the same or a suitable model
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant skilled at analyzing political texts and extracting relevant information based on a search query."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.2,  # Low temperature for more factual extraction
+            max_tokens=1000  # Adjust based on expected output length
+        )
+        result = completion.choices[0].message.content.strip()
+        return result if result else f"No specific context for '{target_word}' could be generated."
+    except Exception as e:
+        error_msg = f"Error during contextual search for '{target_word}': {str(e)}"
+        print(error_msg)
+        traceback.print_exc()
+        # Fallback to concordance if LLM fails?
+        # return get_all_phases_containing_tar_wrd_fallback(target_word, tar_passage)
+        return error_msg  # Or return the error message directly
+
 def analysis(Manifesto, Search):
     try:
         if Manifesto is None:
+            # Ensure return order matches the outputs list
             return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
         if Search.strip() == "":
             Search = "government"
         raw_party = Parsing(Manifesto)
         if isinstance(raw_party, str) and raw_party.startswith("Error"):
             return raw_party, {}, None, None, None, None, None, "Parsing failed"
-
         text_Party = clean_text(raw_party)
         text_Party_processed = Preprocess(text_Party)
+
+        # --- Perform Search FIRST using the ORIGINAL text for better context ---
+        # Pass the original raw text for richer context to the LLM
+        searChRes = get_contextual_search_result(Search, raw_party, groq_client)
 
+        # --- Then proceed with other analyses ---
         summary = generate_summary(raw_party)  # Use raw_party for summary for more context?
 
         # --- Sentiment Analysis ---
@@ -298,10 +379,10 @@ def analysis(Manifesto, Search):
         freq_plot = fDistancePlot(text_Party_processed)
         dispersion_plot = DispersionPlot(text_Party_processed)
         wordcloud = word_cloud_generator(Manifesto, text_Party_processed)  # Pass Manifesto object itself
-
         fdist_Party = fDistance(text_Party_processed)
-
-
+
+        # searChRes is now generated earlier
+
         return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
 
     except Exception as e:
@@ -312,11 +393,10 @@ def analysis(Manifesto, Search):
         return error_msg, {}, None, None, None, None, None, "Analysis failed"
 
 
-# --- Gradio Interface ---
+# --- Gradio Interface (remains largely the same, just ensuring output variable names match) ---
 # Use Blocks for custom layout
 with gr.Blocks(title='Manifesto Analysis') as demo:
     gr.Markdown("# Manifesto Analysis")
-
     # Input Section
     with gr.Row():
         with gr.Column(scale=1):  # Adjust scale if needed
@@ -333,7 +413,8 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
 
         # --- Search Results Tab ---
         with gr.TabItem("Search Results"):
-
+            # Use the specific output variable defined in the layout
+            search_output = gr.Textbox(label='Context Based Search Results', lines=15, interactive=False, max_lines=20)  # Increased lines/max_lines
 
         # --- Key Topics Tab ---
         with gr.TabItem("Key Topics"):
@@ -364,7 +445,7 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
         fn=analysis,
         inputs=[file_input, search_input],
         outputs=[
-            search_output,         # 1
+            search_output,         # 1 (Now contextual)
             topics_output,         # 2
             sentiment_output,      # 3
             subjectivity_output,   # 4
@@ -392,301 +473,3 @@ with gr.Blocks(title='Manifesto Analysis') as demo:
 if __name__ == "__main__":
     demo.launch(debug=True, share=False, show_error=True)
 
-# import random
-# import matplotlib.pyplot as plt
-# import nltk
-# from nltk.tokenize import word_tokenize, sent_tokenize
-# from nltk.corpus import stopwords
-# from nltk.stem import WordNetLemmatizer
-# from nltk.text import Text
-# from nltk.probability import FreqDist
-# from cleantext import clean
-# import textract
-# import urllib.request
-# from io import BytesIO
-# import sys
-# import pandas as pd
-# import cv2
-# import re
-# from wordcloud import WordCloud, ImageColorGenerator
-# from textblob import TextBlob
-# from PIL import Image
-# import os
-# import gradio as gr
-# from dotenv import load_dotenv
-# import groq
-# import json
-# import traceback
-# import numpy as np
-# import unidecode
-# import contractions
-# from sklearn.feature_extraction.text import TfidfVectorizer
-
-
-# # Load environment variables
-# load_dotenv()
-
-# # Download NLTK resources
-# nltk.download(['stopwords', 'wordnet', 'words'])
-# nltk.download('punkt')
-# nltk.download('punkt_tab')
-# # Initialize Groq client
-# groq_api_key = os.getenv("GROQ_API_KEY")
-# groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
-
-# # Stopwords customization
-# stop_words = set(stopwords.words('english'))
-# stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
-
-# # --- Parsing & Preprocessing Functions ---
-# def Parsing(parsed_text):
-#     try:
-#         if hasattr(parsed_text, 'name'):
-#             file_path = parsed_text.name
-#         else:
-#             file_path = parsed_text
-#         raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
-#         return clean(raw_party)
-#     except Exception as e:
-#         print(f"Error parsing PDF: {e}")
-#         return f"Error parsing PDF: {e}"
-
-# def clean_text(text):
-#     text = text.encode("ascii", errors="ignore").decode("ascii")
-#     text = unidecode.unidecode(text)
-#     text = contractions.fix(text)
-#     text = re.sub(r"\n", " ", text)
-#     text = re.sub(r"\t", " ", text)
-#     text = re.sub(r"/ ", " ", text)
-#     text = text.strip()
-#     text = re.sub(" +", " ", text).strip()
-#     text = [word for word in text.split() if word not in stop_words]
-#     return ' '.join(text)
-
-# def Preprocess(textParty):
-#     text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
-#     pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
-#     text2Party = pattern.sub('', text1Party)
-#     return text2Party
-
-# # --- Core Analysis Functions ---
-# def generate_summary(text):
-#     if not groq_client:
-#         return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
-#     if len(text) > 10000:
-#         text = text[:10000]
-#     try:
-#         completion = groq_client.chat.completions.create(
-#             model="llama3-8b-8192",
-#             messages=[
-#                 {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
-#                 {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
-#             ],
-#             temperature=0.3,
-#             max_tokens=800
-#         )
-#         return completion.choices[0].message.content
-#     except Exception as e:
-#         return f"Error generating summary: {str(e)}"
-
-# def fDistance(text2Party):
-#     word_tokens_party = word_tokenize(text2Party)
-#     fdistance = FreqDist(word_tokens_party).most_common(10)
-#     mem = {x[0]: x[1] for x in fdistance}
-
-#     vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
-#     tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
-#     feature_names = vectorizer.get_feature_names_out()
-
-#     tfidf_scores = {}
-#     for i, word in enumerate(feature_names):
-#         scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
-#         if scores:
-#             tfidf_scores[word] = sum(scores) / len(scores)
-
-#     combined_scores = {}
-#     for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
-#         freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
-#         tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
-#         combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
-
-#     top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
-#     return normalize(top_words)
-
-# def normalize(d, target=1.0):
-#     raw = sum(d.values())
-#     factor = target / raw if raw != 0 else 0
-#     return {key: value * factor for key, value in d.items()}
-
-# # --- Visualization Functions with Error Handling ---
-# def safe_plot(func, *args, **kwargs):
-#     try:
-#         plt.clf()
-#         func(*args, **kwargs)
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         return Image.open(buf)
-#     except Exception as e:
-#         print(f"Plotting error: {e}")
-#         return None
-
-# def fDistancePlot(text2Party):
-#     return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
-
-# def DispersionPlot(textParty):
-#     try:
-#         word_tokens_party = word_tokenize(textParty)
-#         moby = Text(word_tokens_party)  # Ensure Text is imported
-#         fdistance = FreqDist(word_tokens_party)
-#         word_Lst = [fdistance.most_common(6)[x][0] for x in range(5)]
-#         plt.figure(figsize=(4, 3))
-#         plt.title('Dispersion Plot')
-#         moby.dispersion_plot(word_Lst)
-#         plt.tight_layout()
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         img = Image.open(buf)
-#         plt.clf()
-#         return img
-#     except Exception as e:
-#         print(f"Dispersion plot error: {e}")
-#         return None
-
-# def word_cloud_generator(parsed_text_name, text_Party):
-#     try:
-#         parsed = parsed_text_name.lower()
-#         if 'bjp' in parsed:
-#             mask_path = 'bjpImg2.jpeg'
-#         elif 'congress' in parsed:
-#             mask_path = 'congress3.jpeg'
-#         elif 'aap' in parsed:
-#             mask_path = 'aapMain2.jpg'
-#         else:
-#             mask_path = None
-
-#         if mask_path and os.path.exists(mask_path):
-#             orgImg = Image.open(mask_path)
-#             mask = np.array(orgImg)
-#             wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
-#             plt.imshow(wordcloud)
-#         else:
-#             wordcloud = WordCloud(max_words=2000).generate(text_Party)
-#             plt.imshow(wordcloud)
-#         plt.axis("off")
-#         buf = BytesIO()
-#         plt.savefig(buf, format='png')
-#         buf.seek(0)
-#         return Image.open(buf)
-#     except Exception as e:
-#         print(f"Word cloud error: {e}")
-#         return None
-
-# def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
-#     """
-#     Function to get all the phrases that contain the target word in a text/passage.
-#     """
-#     if not target_word or target_word.strip() == "":
-#         return "Please enter a search term"
-
-#     tokens = nltk.word_tokenize(tar_passage)
-#     text = nltk.Text(tokens)
-#     c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
-#     offsets = c.offsets(target_word)
-
-#     concordance_txt = [
-#         text.tokens[max(0, offset - left_margin):offset + right_margin]
-#         for offset in offsets[:numLins]
-#     ]
-
-#     result = [' '.join(con_sub) for con_sub in concordance_txt]
-#     return '\n'.join(result)
-
-# # --- Main Analysis Function ---
-# def analysis(Manifesto, Search):
-#     try:
-#         if Manifesto is None:
-#             return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
-#         if Search.strip() == "":
-#             Search = "government"
-
-#         raw_party = Parsing(Manifesto)
-#         if isinstance(raw_party, str) and raw_party.startswith("Error"):
-#             return raw_party, {}, None, None, None, None, None, "Parsing failed"
-
-#         text_Party = clean_text(raw_party)
-#         text_Party_processed = Preprocess(text_Party)
-#         summary = generate_summary(raw_party)
-
-#         df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
-#         df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
-#         df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
-#         df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
-#         df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
-
-#         # Generate Plots with Safe Plotting
-#         sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
-#         subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
-#         freq_plot = fDistancePlot(text_Party_processed)
-#         dispersion_plot = DispersionPlot(text_Party_processed)
-#         wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
-
-#         fdist_Party = fDistance(text_Party_processed)
-#         searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
-
-#         return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
-
-#     except Exception as e:
-#         error_msg = f"Critical error: {str(e)}"
-#         print(error_msg)
-#         traceback.print_exc()
-#         return error_msg, {}, None, None, None, None, None, "Analysis failed"
-
-# # --- Gradio Interface ---
-# Search_txt = "text"
-# filePdf = "file"
-
-# with gr.Blocks(title='Manifesto Analysis') as demo:
-#     gr.Markdown("# Manifesto Analysis")
-#     with gr.Row():
-#         with gr.Column():
-#             file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
-#             search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
-#             submit_btn = gr.Button("Analyze Manifesto")
-#     with gr.Tabs():
-#         with gr.TabItem("Summary"): gr.Textbox(label='LLM Based Summary', lines=10)
-#         with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
-#         with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
-#         with gr.TabItem("Visualizations"):
-#             with gr.Row():
-#                 gr.Image(label='Sentiment Analysis'), gr.Image(label='Subjectivity Analysis')
-#             with gr.Row():
-#                 gr.Image(label='Word Cloud'), gr.Image(label='Frequency Distribution')
-#             gr.Image(label='Dispersion Plot')
-
-#     submit_btn.click(
-#         fn=analysis,
-#         inputs=[file_input, search_input],
-#         outputs=[
-#             gr.Textbox(label='Context Based Search'),
-#             gr.Label(label="Most Relevant Topics (LLM Enhanced)"),
-#             gr.Image(label='Sentiment Analysis'),
-#             gr.Image(label='Subjectivity Analysis'),
-#             gr.Image(label='Word Cloud'),
-#             gr.Image(label='Frequency Distribution'),
-#             gr.Image(label='Dispersion Plot'),
-#             gr.Textbox(label='AI-Generated Summary', lines=10)
-#         ]
-#     )
-
-#     gr.Examples(
-#         examples=[
-#             ["Example/AAP_Manifesto_2019.pdf", "government"],
-#             ["Example/Bjp_Manifesto_2019.pdf", "environment"],
-#             ["Example/Congress_Manifesto_2019.pdf", "safety"]
-#         ],
-#         inputs=[file_input, search_input]
-#     )
-
-#     demo.launch(debug=True, share=False, show_error=True)