File size: 4,068 Bytes
fee8ffb
590870f
8d54f3a
 
 
 
 
 
590870f
021c961
 
4c69665
8d54f3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c11257
8d54f3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c11257
590870f
8d54f3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
021c961
 
 
 
 
 
8d54f3a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import html
import re

import fitz  # PyMuPDF for PDF extraction
import nltk
import streamlit as st
from rouge_score import rouge_scorer  # For ROUGE score evaluation
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer

# Ensure the necessary tokenizer is available.
# Streamlit re-executes this script on every interaction, so look for the
# resource locally first and only invoke the downloader when it is missing.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab", quiet=True)

# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract and clean the text of every page of an uploaded PDF.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the raw PDF
            bytes.

    Returns:
        The text of all pages, each followed by a newline, concatenated and
        passed through ``clean_text``.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)
    return clean_text(text)

# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
def clean_text(text):
    """Reduce *text* to single-spaced, readable ASCII.

    Processing order:
      1. Remove common bullet glyphs.
      2. Drop every character that is not an ASCII letter, digit, whitespace
         or one of ``. , ! ? ( ) ' " % $ @ &``.
      3. Collapse each run of whitespace into one space and trim both ends.

    Whitespace is collapsed last so that a removed character sitting between
    two spaces does not leave a double space behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Bullets are written as escapes so the pattern does not depend on the
    # encoding in which this source file is saved or read.
    text = re.sub(
        r"[\u2022\u2023\u00B7\u25A0\u25A2\u25AA\u25AB\u25CB\u25CF\u25E6\u2666]",
        "",
        text,
    )
    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)  # Keep only readable text
    text = re.sub(r"\s+", " ", text)  # Normalize spaces
    return text.strip()

# Function to summarize text using LSA
def summarize_text(text, num_sentences=3):
    """Build an extractive summary of *text* with sumy's LSA summarizer.

    The input is run through ``clean_text`` first, then parsed with the
    English tokenizer.

    Args:
        text: The text to summarize.
        num_sentences: Sentence count passed to the summarizer (default 3).

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined with single spaces.
    """
    cleaned = clean_text(text)
    plaintext_parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    lsa = LsaSummarizer()
    picked_sentences = lsa(plaintext_parser.document, num_sentences)
    return " ".join(map(str, picked_sentences))

# Function to calculate ROUGE scores
def calculate_rouge(reference_text, generated_summary):
    """Score *generated_summary* against *reference_text* with ROUGE.

    Stemming is enabled on the scorer.

    Args:
        reference_text: Text passed to the scorer as the target.
        generated_summary: Text passed to the scorer as the prediction.

    Returns:
        A 3-tuple ``(rouge1, rouge2, rougeL)`` holding the F-measure of each
        metric.
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    results = scorer.score(reference_text, generated_summary)
    return tuple(results[name].fmeasure for name in metric_names)

# Streamlit UI
# Emoji are written as \U escapes so they survive any re-encoding of this file.
st.title("\U0001F4C4 Text Summarization App")  # "page facing up" emoji
st.write("This app summarizes long text using **Latent Semantic Analysis (LSA)**, an **unsupervised learning method**, and evaluates the summary using **ROUGE scores**.")

# Sidebar input options: an uploaded TXT/PDF file or manually entered text.
st.sidebar.header("Options")
file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
manual_text = st.sidebar.text_area("Or enter text manually", "")

# Explanation of the models
st.subheader("\U0001F50E How It Works")  # "magnifying glass" emoji
st.markdown("""
- **Summarization Model: Latent Semantic Analysis (LSA)**  
  LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.  
  It finds hidden relationships between words and sentences **without requiring labeled data**.  
- **Evaluation Metric: ROUGE Score**  
  - **ROUGE-1**: Measures single-word overlap  
  - **ROUGE-2**: Measures two-word sequence overlap  
  - **ROUGE-L**: Measures the longest common subsequence  
""")

# Summarization button: pick the input source, summarize it, show the result.
if st.sidebar.button("Summarize"):
    # An uploaded file takes precedence over manually entered text.
    if file_uploaded:
        if file_uploaded.type == "text/plain":  # TXT file
            # Replace undecodable bytes instead of crashing on non-UTF-8 uploads.
            text = file_uploaded.read().decode("utf-8", errors="replace")
        elif file_uploaded.type == "application/pdf":  # PDF file
            text = extract_text_from_pdf(file_uploaded)
        else:
            st.sidebar.error("Unsupported file format.")
            st.stop()
    elif manual_text.strip():
        text = manual_text
    else:
        st.sidebar.error("Please upload a file or enter text.")
        st.stop()

    # An uploaded file can yield no text at all (e.g. an empty file).
    if not text.strip():
        st.sidebar.error("No readable text was found in the input.")
        st.stop()

    # Show loading animation
    with st.spinner("Summarizing text... Please wait."):
        # Generate summary
        summary = summarize_text(text, num_sentences=5)
        # Calculate ROUGE score; the full input text serves as the reference.
        rouge1, rouge2, rougeL = calculate_rouge(text, summary)

    # Nothing meaningful to display if the summarizer selected no sentences.
    if not summary:
        st.warning("The text could not be summarized.")
        st.stop()

    # Display summary in justified format.
    # The summary is derived from user-supplied input and is rendered as raw
    # HTML, so it is escaped before interpolation.
    st.subheader("\U0001F4CC Summarized Text")  # "pushpin" emoji
    st.markdown(
        f"<p style='text-align: justify;'>{html.escape(summary)}</p>",
        unsafe_allow_html=True,
    )

    # Display ROUGE scores
    st.subheader("\U0001F4CA Summary Quality (ROUGE Score)")  # "bar chart" emoji
    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")