Regino committed on
Commit
8d54f3a
·
1 Parent(s): 1c11257
Files changed (2) hide show
  1. app.py +91 -24
  2. requirements.txt +5 -2
app.py CHANGED
@@ -1,29 +1,96 @@
1
  import streamlit as st
2
- from transformers import pipeline
 
 
 
 
 
3
 
4
- # Set page title and description
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  st.title("📄 Text Summarization App")
6
- st.write("""
7
- This app allows users to upload a text file and get a summarized version using a Natural Language Processing (NLP) model.
8
- It uses the `transformers` library from Hugging Face, which provides state-of-the-art machine learning models.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """)
10
 
11
- # Load summarization pipeline
12
- summarizer = pipeline("summarization")
13
-
14
- # File uploader
15
- uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
16
-
17
- if uploaded_file is not None:
18
- # Read the file content
19
- text = uploaded_file.read().decode("utf-8")
20
-
21
- # Display original text (optional)
22
- st.subheader("Original Text")
23
- st.text_area("Content:", text, height=200)
24
-
25
- # Summarize the text
26
- if st.button("Summarize"):
27
- summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
28
- st.subheader("Summarized Text")
29
- st.write(summary[0]['summary_text'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import fitz # PyMuPDF for PDF extraction
3
+ import re
4
+ from sumy.parsers.plaintext import PlaintextParser
5
+ from sumy.nlp.tokenizers import Tokenizer
6
+ from sumy.summarizers.lsa import LsaSummarizer
7
+ from rouge_score import rouge_scorer # For ROUGE score evaluation
8
 
9
# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract and clean the plain text of an uploaded PDF.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            raw PDF bytes.

    Returns:
        The text of all pages, in page order, passed through ``clean_text``.
    """
    # Open the PDF from memory. The context manager closes the document even
    # if extracting a page raises, so the handle is never leaked.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        # Each page is followed by a newline so the last word of one page
        # does not fuse with the first word of the next.
        text = "".join(page.get_text("text") + "\n" for page in doc)
    return clean_text(text)
16
+
17
# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
def clean_text(text):
    """Reduce *text* to plain, single-spaced ASCII prose.

    Bullet glyphs are removed, every character outside the allowed set
    (ASCII letters, digits, ``.,!?()'"%$@&`` and whitespace) is dropped, and
    runs of whitespace are collapsed to one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Bullet glyphs are written as escapes so the pattern survives any
    # re-encoding of this source file.
    text = re.sub(
        r"[\u2022\u2023\u25AA\u25AB\u25A0\u25CF\u25E6\u25CB\u25B6\u2666\u00B7]",
        "",
        text,
    )
    # Keep only readable text.
    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
    # Collapse whitespace last: dropping a character that sat between two
    # spaces (e.g. the dash in "a - b") would otherwise leave a double space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
24
+
25
# Function to summarize text using LSA
def summarize_text(text, num_sentences=3):
    """Build an extractive summary of *text* with sumy's LSA summarizer.

    Args:
        text: The text to summarize; it is passed through ``clean_text``
            before parsing.
        num_sentences: Sentence count requested from the summarizer.

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined with single spaces.
    """
    cleaned = clean_text(text)
    parsed = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    lsa = LsaSummarizer()
    chosen_sentences = lsa(parsed.document, num_sentences)
    return " ".join(map(str, chosen_sentences))
32
+
33
# Function to calculate ROUGE scores
def calculate_rouge(reference_text, generated_summary):
    """Score *generated_summary* against *reference_text* with ROUGE.

    Args:
        reference_text: The text the summary is compared to.
        generated_summary: The summary being evaluated.

    Returns:
        A ``(rouge1, rouge2, rougeL)`` tuple holding the ``fmeasure`` of each
        metric, computed with stemming enabled.
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    results = scorer.score(reference_text, generated_summary)
    return tuple(results[name].fmeasure for name in metric_names)
43
+
44
# Streamlit UI
# Imported here, next to its only use: escaping the summary before it is
# rendered as raw HTML.
import html

# Emoji are written as escapes so the titles survive re-encoding of this file.
st.title("\U0001F4C4 Text Summarization App")
st.write("This app summarizes long text using **Latent Semantic Analysis (LSA)**, an **unsupervised learning method**, and evaluates the summary using **ROUGE scores**.")

# Sidebar input options
st.sidebar.header("Options")
file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
manual_text = st.sidebar.text_area("Or enter text manually", "")

# Explanation of the models
st.subheader("\U0001F50E How It Works")
st.markdown("""
- **Summarization Model: Latent Semantic Analysis (LSA)**
  LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.
  It finds hidden relationships between words and sentences **without requiring labeled data**.
- **Evaluation Metric: ROUGE Score**
  - **ROUGE-1**: Measures single-word overlap
  - **ROUGE-2**: Measures two-word sequence overlap
  - **ROUGE-L**: Measures the longest common subsequence
""")

# Summarization button
if st.sidebar.button("Summarize"):
    # An uploaded file takes precedence over manually entered text.
    if file_uploaded:
        if file_uploaded.type == "text/plain":  # TXT file
            # Replace undecodable bytes instead of crashing on files that
            # are not valid UTF-8.
            text = file_uploaded.read().decode("utf-8", errors="replace")
        elif file_uploaded.type == "application/pdf":  # PDF file
            text = extract_text_from_pdf(file_uploaded)
        else:
            st.sidebar.error("Unsupported file format.")
            st.stop()
    elif manual_text.strip():
        text = manual_text
    else:
        st.sidebar.error("Please upload a file or enter text.")
        st.stop()

    # Generate summary
    summary = summarize_text(text, num_sentences=5)

    # Nothing to show or score when the summary is empty.
    if not summary:
        st.warning("No summarizable text was found in the input.")
        st.stop()

    # Calculate ROUGE score. The input text itself serves as the reference,
    # so the scores reflect overlap between the summary and its own source.
    rouge1, rouge2, rougeL = calculate_rouge(text, summary)

    # Display summary in justified format. The summary comes from user input
    # and is rendered with unsafe_allow_html, so escape it first.
    st.subheader("\U0001F4CC Summarized Text")
    st.markdown(
        f"<p style='text-align: justify;'>{html.escape(summary)}</p>",
        unsafe_allow_html=True,
    )

    # Display ROUGE scores
    st.subheader("\U0001F4CA Summary Quality (ROUGE Score)")
    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")
96
+
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  streamlit
2
- torch
3
- transformers
 
 
 
 
1
  streamlit
2
+ pymupdf
3
+ sumy
4
+ rouge-score
5
+ numpy
6
+ nltk