Regino
fbng
021c961
import html
import re

import fitz  # PyMuPDF for PDF extraction
import nltk
import streamlit as st
from rouge_score import rouge_scorer  # For ROUGE score evaluation
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
# Ensure the necessary tokenizer data is available. Streamlit re-executes this
# script on every interaction, so look for a local copy first and only contact
# the download server when the resource is actually missing.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")
# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract and clean the text of every page in an uploaded PDF.

    Args:
        uploaded_file: A binary file-like object whose ``read()`` returns the
            raw bytes of a PDF document.

    Returns:
        The text of all pages, in page order, passed through ``clean_text``.
    """
    # Open the document as a context manager so it is closed even when
    # extraction of a page raises.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        page_texts = [page.get_text("text") for page in doc]
    # Terminate each page with a newline so words on adjacent pages do not
    # run together before cleaning collapses the whitespace.
    raw_text = "".join(f"{page_text}\n" for page_text in page_texts)
    return clean_text(raw_text)
# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
def clean_text(text):
    """Reduce *text* to plain, single-spaced prose.

    Only ASCII letters, digits, whitespace and the punctuation
    ``. , ! ? ( ) ' " % $ @ &`` are kept; everything else (bullets, dashes
    and other symbols) is removed. Runs of whitespace are then collapsed to a
    single space and the result is stripped at both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing readable remains.
    """
    # Strip disallowed characters first. The whitelist already covers every
    # bullet glyph, so no separate bullet pass is needed.
    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
    # Normalize whitespace last so that removing a symbol that stood between
    # two spaces (e.g. "a # b") cannot leave a double space behind.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# Function to summarize text using LSA
def summarize_text(text, num_sentences=3):
    """Summarize *text* with sumy's LSA summarizer.

    The input is run through ``clean_text`` before parsing with the English
    tokenizer.

    Args:
        text: The text to summarize.
        num_sentences: How many sentences to request from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    cleaned = clean_text(text)
    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    lsa = LsaSummarizer()
    chosen_sentences = lsa(parser.document, num_sentences)
    return " ".join(map(str, chosen_sentences))
# Function to calculate ROUGE scores
def calculate_rouge(reference_text, generated_summary):
    """Score *generated_summary* against *reference_text* with ROUGE.

    Args:
        reference_text: The text the summary is compared to.
        generated_summary: The summary being evaluated.

    Returns:
        A ``(rouge1, rouge2, rougeL)`` tuple holding the F-measure of each
        metric, computed with stemming enabled.
    """
    metrics = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    results = scorer.score(reference_text, generated_summary)
    return tuple(results[metric].fmeasure for metric in metrics)
# Streamlit UI
# The page emoji is written as an escape sequence so it cannot be corrupted
# if this file is saved or transferred with the wrong encoding.
st.title("\U0001F4C4 Text Summarization App")
st.write(
    "This app summarizes long text using **Latent Semantic Analysis (LSA)**, "
    "an **unsupervised learning method**, and evaluates the summary using "
    "**ROUGE scores**."
)
# Sidebar input options: the user either uploads a file or types text.
st.sidebar.header("Options")
file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
manual_text = st.sidebar.text_area("Or enter text manually", "")
# Explanation of the models
# The magnifying-glass emoji is written as an escape sequence so it cannot be
# corrupted if this file is saved or transferred with the wrong encoding.
st.subheader("\U0001F50E How It Works")
st.markdown("""
- **Summarization Model: Latent Semantic Analysis (LSA)**
LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.
It finds hidden relationships between words and sentences **without requiring labeled data**.
- **Evaluation Metric: ROUGE Score**
- **ROUGE-1**: Measures single-word overlap
- **ROUGE-2**: Measures two-word sequence overlap
- **ROUGE-L**: Measures the longest common subsequence
""")
# Summarization button
if st.sidebar.button("Summarize"):
    # Resolve the input text; an uploaded file takes precedence over text
    # typed into the sidebar.
    if file_uploaded:
        if file_uploaded.type == "text/plain":  # TXT file
            # Replace undecodable bytes instead of crashing on uploads that
            # are not valid UTF-8.
            text = file_uploaded.read().decode("utf-8", errors="replace")
        elif file_uploaded.type == "application/pdf":  # PDF file
            text = extract_text_from_pdf(file_uploaded)
        else:
            st.sidebar.error("Unsupported file format.")
            st.stop()
    elif manual_text.strip():
        text = manual_text
    else:
        st.sidebar.error("Please upload a file or enter text.")
        st.stop()

    # Nothing to summarize, e.g. an empty file or a PDF with no extractable text.
    if not text.strip():
        st.sidebar.error("No readable text was found in the input.")
        st.stop()

    # Show loading animation while the summary and its scores are computed.
    with st.spinner("Summarizing text... Please wait."):
        # Generate summary
        summary = summarize_text(text, num_sentences=5)
        # Calculate ROUGE score of the summary against the input text
        rouge1, rouge2, rougeL = calculate_rouge(text, summary)

    # Display summary in justified format. The summary is derived from user
    # input and is injected into raw HTML, so escape it first; otherwise
    # characters such as "&" are interpreted as markup.
    st.subheader("\U0001F4CC Summarized Text")
    st.markdown(
        f"<p style='text-align: justify;'>{html.escape(summary)}</p>",
        unsafe_allow_html=True,
    )

    # Display ROUGE scores
    st.subheader("\U0001F4CA Summary Quality (ROUGE Score)")
    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")