# NOTE(review): the three lines below ("Spaces: / Sleeping / Sleeping") are
# Hugging Face Spaces page-scrape residue, not Python code. Commented out so
# the file parses; original text preserved:
# Spaces: Sleeping Sleeping
# Standard library
import re

# Third-party
import fitz  # PyMuPDF, used for PDF text extraction
import nltk
import streamlit as st
from rouge_score import rouge_scorer  # ROUGE-score evaluation of summaries
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Ensure the sentence-tokenizer data sumy's Tokenizer relies on is present.
# "punkt_tab" is the NLTK >= 3.8.2 replacement for the legacy "punkt" bundle.
nltk.download("punkt_tab")
# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract and clean the plain text of every page of an uploaded PDF.

    Args:
        uploaded_file: A file-like object (e.g. a Streamlit UploadedFile)
            positioned at the start of a PDF byte stream.

    Returns:
        str: The concatenated per-page text, passed through clean_text().
    """
    # Open from the in-memory byte stream; the context manager guarantees the
    # document handle is closed even if text extraction raises (the original
    # never closed it).
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)
    return clean_text(text)
# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
def clean_text(text):
    """Normalize extracted text: strip bullets/odd symbols, collapse whitespace.

    Args:
        text (str): Raw text, e.g. as extracted from a PDF page.

    Returns:
        str: Text containing only ASCII letters, digits, common punctuation
        and single spaces, with no leading/trailing whitespace.
    """
    # NOTE(review): this character class appears to be mojibake of the same
    # bullet glyphs filtered on the next line — kept byte-for-byte so any text
    # containing these exact characters is still cleaned identically.
    text = re.sub(r"[β’βͺββ¦ββΆβ¦]", "", text)
    # Common Unicode bullet characters.
    text = re.sub(r"[\u2022\u2023\u25AA\u25AB\u25A0\u25CF\u00B7]", "", text)
    # Whitelist readable characters; everything else (emoji, stray glyphs,
    # non-ASCII punctuation) is dropped.
    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
    # Collapse whitespace runs LAST, so gaps left by the removals above do not
    # survive as double spaces (the original collapsed before filtering).
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# Function to summarize text using LSA
def summarize_text(text, num_sentences=3):
    """Produce an extractive summary of *text* with sumy's LSA summarizer.

    Args:
        text (str): Source text to summarize.
        num_sentences (int): Number of sentences to keep (default 3).

    Returns:
        str: The selected sentences joined by single spaces.
    """
    text = clean_text(text)  # normalize before tokenizing
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    # LSA ranks sentences via SVD of the term-sentence matrix — unsupervised,
    # no labeled data required.
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)
# Function to calculate ROUGE scores
def calculate_rouge(reference_text, generated_summary):
    """Score a generated summary against the reference text with ROUGE.

    Args:
        reference_text (str): The reference text (here, the full input).
        generated_summary (str): The candidate summary to evaluate.

    Returns:
        tuple[float, float, float]: F-measures for ROUGE-1, ROUGE-2, ROUGE-L.
    """
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference_text, generated_summary)
    # .fmeasure balances precision (summary-side overlap) and recall
    # (reference-side overlap) for each ROUGE variant.
    return (
        scores["rouge1"].fmeasure,
        scores["rouge2"].fmeasure,
        scores["rougeL"].fmeasure,
    )
# ---------------------------------------------------------------------------
# Streamlit UI
# NOTE(review): the "π" in several UI strings below is a garbled emoji from
# the original encoding; reproduced byte-for-byte to avoid guessing intent.
# Indentation of the if/with blocks was lost in extraction and is
# reconstructed here from the control-flow logic.
# ---------------------------------------------------------------------------
st.title("π Text Summarization App")
st.write("This app summarizes long text using **Latent Semantic Analysis (LSA)**, an **unsupervised learning method**, and evaluates the summary using **ROUGE scores**.")

# Sidebar input options
st.sidebar.header("Options")
file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
manual_text = st.sidebar.text_area("Or enter text manually", "")

# Explanation of the models
st.subheader("π How It Works")
st.markdown("""
- **Summarization Model: Latent Semantic Analysis (LSA)**
LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.
It finds hidden relationships between words and sentences **without requiring labeled data**.
- **Evaluation Metric: ROUGE Score**
- **ROUGE-1**: Measures single-word overlap
- **ROUGE-2**: Measures two-word sequence overlap
- **ROUGE-L**: Measures the longest common subsequence
""")

# Summarization button
if st.sidebar.button("Summarize"):
    # Resolve the input text: an uploaded file takes precedence over the
    # manual text box.
    if file_uploaded:
        if file_uploaded.type == "text/plain":  # TXT file
            text = file_uploaded.read().decode("utf-8")
        elif file_uploaded.type == "application/pdf":  # PDF file
            text = extract_text_from_pdf(file_uploaded)
        else:
            st.sidebar.error("Unsupported file format.")
            st.stop()
    elif manual_text.strip():
        text = manual_text
    else:
        st.sidebar.error("Please upload a file or enter text.")
        st.stop()

    # Show loading animation while summarizing and scoring
    with st.spinner("Summarizing text... Please wait."):
        # Generate summary
        summary = summarize_text(text, num_sentences=5)
        # Calculate ROUGE score against the full input text
        rouge1, rouge2, rougeL = calculate_rouge(text, summary)

    # Display summary in justified format
    st.subheader("π Summarized Text")
    st.markdown(f"<p style='text-align: justify;'>{summary}</p>", unsafe_allow_html=True)

    # Display ROUGE scores
    st.subheader("π Summary Quality (ROUGE Score)")
    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")