Upload 8 files

- README.md +5 -1
- app.py +109 -168
- gitattributes +35 -0
- papersearch.py +154 -0
- pdfpass.py +40 -0
- pdfsum.py +125 -0
- requirements.txt +0 -0
- textsumm.py +28 -0
README.md
CHANGED
@@ -1,3 +1,4 @@
+---
 title: Pdf Tools Suite
 emoji: 📚
 colorFrom: gray
@@ -5,4 +6,7 @@ colorTo: purple
 sdk: streamlit
 sdk_version: 1.42.1
 app_file: app.py
-pinned: false
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,168 +1,109 @@
[deleted lines 1-109 not shown]
-    summary_text = ""
-    pdf_text = ""
-    return "", "", ""
-
-with gr.Blocks(
-    title="PDF 摘要助手",
-    css="""
-    .gradio-container {
-        max-width: none !important;
-        width: 100% !important;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-        min-height: 100vh;
-    }
-    .main-content {
-        max-width: 1600px !important;
-        margin: 20px auto !important;
-        padding: 30px !important;
-        background: rgba(255, 255, 255, 0.95) !important;
-        border-radius: 20px !important;
-    }
-    """
-) as demo:
-    with gr.Column():
-        gr.Markdown("## 📄 PDF 摘要 & 問答助手")
-
-        with gr.Tab("🔧 設定"):
-            api_key_input = gr.Textbox(label="🔑 輸入 OpenAI API Key", type="password")
-            api_key_status = gr.Textbox(label="API 狀態", interactive=False, value="等待設定 API Key...")
-            api_key_btn = gr.Button("確認 API Key")
-            api_key_btn.click(set_api_key, inputs=api_key_input, outputs=api_key_status)
-
-            model_choice = gr.Radio(["gpt-4", "gpt-4.1", "gpt-4.5"], label="選擇 AI 模型", value="gpt-4")
-            model_status = gr.Textbox(label="模型狀態", interactive=False, value="✅ 已選擇:gpt-4")
-            model_choice.change(set_model, inputs=model_choice, outputs=model_status)
-
-        with gr.Tab("📄 摘要"):
-            pdf_upload = gr.File(label="上傳 PDF", file_types=[".pdf"])
-            summary_btn = gr.Button("生成摘要")
-            summary_output = gr.Textbox(label="PDF 摘要", lines=12)
-            summary_btn.click(generate_summary, inputs=pdf_upload, outputs=summary_output)
-
-        with gr.Tab("❓ 問答"):
-            question_input = gr.Textbox(label="請輸入問題", lines=2)
-            question_btn = gr.Button("送出問題")
-            answer_output = gr.Textbox(label="AI 回答", lines=8)
-            question_btn.click(ask_question, inputs=question_input, outputs=answer_output)
-            question_input.submit(ask_question, inputs=question_input, outputs=answer_output)
-
-        clear_btn = gr.Button("🗑️ 清除所有資料")
-        clear_btn.click(clear_all, outputs=[summary_output, question_input, answer_output])
-
-if __name__ == "__main__":
-    demo.launch(
-        show_error=True,
-        share=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )
+import streamlit as st
+from textsumm import summarizer
+from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
+from pdfpass import remove_pdf_password
+from papersearch import fetch_papers, filter_papers_by_year
+from io import BytesIO
+from datetime import datetime
+from pypdf import PdfReader, PdfWriter
+
+# Streamlit App Config
+st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
+
+# Sidebar Navigation
+st.sidebar.title("📄 PDF Tools Suite")
+page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])
+
+# Tool: Text Summarizer
+if page == "Text Summarizer":
+    st.title("📝 Text Summarizer")
+    user_input = st.text_area("Enter text to summarize")
+    if st.button("Summarize"):
+        summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
+        st.subheader("Summary")
+        st.write(summary[0]["summary_text"])
+
+# Tool: PDF Summarizer
+elif page == "PDF Summarizer":
+    st.title("📜 PDF Summarizer")
+    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+    if uploaded_file is not None:
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        chunks = split_text_into_chunks(pdf_text)
+        summaries = summarize_text(chunks)
+        full_summary = " ".join(summaries)
+        st.subheader("Summary")
+        st.write(full_summary)
+
+# Tool: PDF Password Remover
+elif page == "PDF Password Remover":
+    st.title("🔑 Remove PDF Password")
+    uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
+    password = st.text_input("Enter the PDF password", type="password")
+    if uploaded_file and password and st.button("Remove Password"):
+        output = remove_pdf_password(uploaded_file, password)
+        if isinstance(output, BytesIO):
+            st.success("Password removed successfully!")
+            st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
+        else:
+            st.error(f"Error: {output}")
+
+# Tool: Research Paper Search
+elif page == "Research Paper Search":
+    st.title("🔍 Research Paper Search (arXiv)")
+    query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
+    max_results = st.slider("Number of results", 1, 50, 10)
+    col1, col2 = st.columns(2)
+    with col1:
+        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
+    with col2:
+        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+    if st.button("Search"):
+        papers = fetch_papers(query, max_results)
+        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+        if papers_filtered:
+            for idx, paper in enumerate(papers_filtered, start=1):
+                st.write(f"### {idx}. {paper['title']}")
+                st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                st.write(f"**Published**: {paper['published']}")
+                st.write(f"[Read More]({paper['link']})")
+                st.write("---")
+        else:
+            st.warning("No papers found in the selected range.")
+
+# Tool: PDF Merger
+elif page == "PDF Merger":
+    st.title("📎 Merge Multiple PDFs")
+    uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
+    if uploaded_files and st.button("Merge PDFs"):
+        pdf_writer = PdfWriter()
+        for file in uploaded_files:
+            pdf_reader = PdfReader(file)
+            for page in pdf_reader.pages:
+                pdf_writer.add_page(page)
+        output = BytesIO()
+        pdf_writer.write(output)
+        output.seek(0)
+        st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")
+
+# Tool: PDF Splitter
+elif page == "PDF Splitter":
+    st.title("✂️ Split PDF into Pages")
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_file:
+        pdf_reader = PdfReader(uploaded_file)
+        for i, page in enumerate(pdf_reader.pages):
+            pdf_writer = PdfWriter()
+            pdf_writer.add_page(page)
+            output = BytesIO()
+            pdf_writer.write(output)
+            output.seek(0)
+            st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")
+
+# Tool: PDF to Text Converter
+elif page == "PDF to Text Converter":
+    st.title("📜 Extract Text from PDF")
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_file:
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        st.text_area("Extracted Text", pdf_text, height=300)
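The merger and splitter branches above do their pypdf work inline in the Streamlit page. As a minimal sketch (not part of the commit), the same merge logic can be pulled into a plain function so it can be exercised outside Streamlit; the file names are placeholders:

```python
from io import BytesIO
from pypdf import PdfReader, PdfWriter

def merge_pdfs(paths):
    """Concatenate the pages of several PDFs into one in-memory PDF."""
    writer = PdfWriter()
    for path in paths:
        reader = PdfReader(path)
        for pdf_page in reader.pages:
            writer.add_page(pdf_page)
    buffer = BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer

# Hypothetical usage with local files:
# merged = merge_pdfs(["a.pdf", "b.pdf"])
# with open("merged.pdf", "wb") as f:
#     f.write(merged.getvalue())
```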
gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
papersearch.py
ADDED
@@ -0,0 +1,154 @@
+# import streamlit as st
+# import requests
+# import xmltodict
+
+# # arXiv API base URL
+# ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+# def fetch_papers(query, max_results=10):
+#     """Fetch papers from the arXiv API."""
+#     try:
+#         # Build the API query URL
+#         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+#         # Make the API request
+#         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+#         response.raise_for_status()
+
+#         # Parse the XML response
+#         data = xmltodict.parse(response.text)
+#         entries = data.get('feed', {}).get('entry', [])
+
+#         if not isinstance(entries, list):  # Handle single result
+#             entries = [entries]
+
+#         # Extract relevant fields
+#         papers = []
+#         for entry in entries:
+#             papers.append({
+#                 'title': entry.get('title'),
+#                 'summary': entry.get('summary'),
+#                 'published': entry.get('published'),
+#                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+#                 'link': entry.get('id')
+#             })
+
+#         return papers
+#     except Exception as e:
+#         st.error(f"Error fetching papers: {e}")
+#         return []
+
+# # Streamlit app UI
+# st.title("arXiv Research Paper Search")
+# st.subheader("Find academic papers on your topic of interest")
+
+# # Input fields
+# query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+# max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+# if st.button("Search"):
+#     if query.strip():
+#         st.info(f"Searching for papers on: **{query}**")
+#         papers = fetch_papers(query, max_results)
+
+#         if papers:
+#             st.success(f"Found {len(papers)} papers!")
+#             for idx, paper in enumerate(papers, start=1):
+#                 st.write(f"### {idx}. {paper['title']}")
+#                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+#                 st.write(f"**Published**: {paper['published']}")
+#                 st.write(f"[Read More]({paper['link']})")
+#                 st.write("---")
+#         else:
+#             st.warning("No papers found. Try a different query.")
+#     else:
+#         st.error("Please enter a topic or keywords to search.")
+
+import streamlit as st
+import requests
+import xmltodict
+from datetime import datetime
+
+# arXiv API base URL
+ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+def fetch_papers(query, max_results=10):
+    """Fetch papers from the arXiv API."""
+    try:
+        # Build the API query URL
+        api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+        # Make the API request
+        response = requests.get(api_url, headers={'Accept': 'application/xml'})
+        response.raise_for_status()
+
+        # Parse the XML response
+        data = xmltodict.parse(response.text)
+        entries = data.get('feed', {}).get('entry', [])
+
+        if not isinstance(entries, list):  # Handle single result
+            entries = [entries]
+
+        # Extract relevant fields
+        papers = []
+        for entry in entries:
+            papers.append({
+                'title': entry.get('title'),
+                'summary': entry.get('summary'),
+                'published': entry.get('published'),
+                'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+                'link': entry.get('id')
+            })
+
+        return papers
+    except Exception as e:
+        st.error(f"Error fetching papers: {e}")
+        return []
+
+def filter_papers_by_year(papers, start_year, end_year):
+    """Filter papers by the publication year range."""
+    filtered_papers = []
+    for paper in papers:
+        try:
+            published_year = int(paper['published'][:4])  # Extract year from the published date
+            if start_year <= published_year <= end_year:
+                filtered_papers.append(paper)
+        except ValueError:
+            continue  # Skip if the year is not valid
+    return filtered_papers
+
+# Streamlit app UI
+st.title("arXiv Research Paper Search")
+st.subheader("Find academic papers on your topic of interest")
+
+# Input fields
+query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+# Year filter
+col1, col2 = st.columns(2)
+with col1:
+    start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
+with col2:
+    end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
+
+if st.button("Search"):
+    if query.strip():
+        st.info(f"Searching for papers on: **{query}**")
+        papers = fetch_papers(query, max_results)
+
+        # Filter papers by year
+        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+
+        if papers_filtered:
+            st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
+            for idx, paper in enumerate(papers_filtered, start=1):
+                st.write(f"### {idx}. {paper['title']}")
+                st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                st.write(f"**Published**: {paper['published']}")
+                st.write(f"[Read More]({paper['link']})")
+                st.write("---")
+        else:
+            st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
+    else:
+        st.error("Please enter a topic or keywords to search.")
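papersearch.py exposes fetch_papers and filter_papers_by_year as plain functions, which is how app.py consumes them (its module-level Streamlit calls typically just log "bare mode" warnings when imported outside `streamlit run`). A minimal standalone check might look like the sketch below; the query and year range are arbitrary examples, and network access to export.arxiv.org is assumed:

```python
from papersearch import fetch_papers, filter_papers_by_year

papers = fetch_papers("quantum computing", max_results=5)
recent = filter_papers_by_year(papers, 2020, 2024)
for paper in recent:
    # 'published' is an ISO timestamp string from the arXiv Atom feed
    print(paper["published"][:10], "-", paper["title"])
```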
pdfpass.py
ADDED
@@ -0,0 +1,40 @@
+import streamlit as st
+from PyPDF2 import PdfReader, PdfWriter
+from io import BytesIO
+
+def remove_pdf_password(file, password):
+    try:
+        reader = PdfReader(file)
+        if reader.is_encrypted:
+            reader.decrypt(password)
+        writer = PdfWriter()
+        for page in reader.pages:
+            writer.add_page(page)
+
+        output = BytesIO()
+        writer.write(output)
+        output.seek(0)
+        return output
+    except Exception as e:
+        return str(e)
+
+st.title("PDF Password Remover")
+st.write("Upload a password-protected PDF and remove its password.")
+
+# File upload
+uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+password = st.text_input("Enter the PDF password", type="password")
+
+if uploaded_file and password:
+    if st.button("Remove Password"):
+        output = remove_pdf_password(uploaded_file, password)
+        if isinstance(output, BytesIO):
+            st.success("Password removed successfully!")
+            st.download_button(
+                label="Download PDF without Password",
+                data=output,
+                file_name="unlocked_pdf.pdf",
+                mime="application/pdf",
+            )
+        else:
+            st.error(f"Error: {output}")
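remove_pdf_password returns a BytesIO on success and an error string on failure, which is why both app.py and the standalone page in pdfpass.py check the return type before offering a download. A minimal sketch of the same helper used on a local file; the file name and password are placeholders, and importing pdfpass also executes its own small Streamlit page at module level:

```python
from io import BytesIO
from pdfpass import remove_pdf_password

with open("locked.pdf", "rb") as f:            # placeholder input file
    result = remove_pdf_password(f, "secret")  # placeholder password

if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getvalue())
    print("Wrote unlocked.pdf")
else:
    print("Could not remove password:", result)
```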
pdfsum.py
ADDED
@@ -0,0 +1,125 @@
+# import streamlit as st
+# from transformers import pipeline
+# from PyPDF2 import PdfReader
+
+# # Initialize the summarizer
+# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+# def extract_text_from_pdf(pdf_file):
+#     """Extract text from an uploaded PDF file."""
+#     try:
+#         reader = PdfReader(pdf_file)
+#         text = ""
+#         for page in reader.pages:
+#             page_text = page.extract_text()
+#             if page_text:  # Skip pages with no text
+#                 text += page_text + "\n"
+#         return text
+#     except Exception as e:
+#         raise ValueError(f"Error extracting text from PDF: {e}")
+
+# def split_text_into_chunks(text, max_chunk_size=1024):
+#     """Split the text into smaller chunks for summarization."""
+#     chunks = []
+#     while len(text) > max_chunk_size:
+#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
+#         if split_point == 0:  # No sentence boundary found, split arbitrarily
+#             split_point = max_chunk_size
+#         chunks.append
+
+# # Streamlit Dashboard
+# st.title("PDF Summarizer")
+# st.write("Upload a PDF file to get a summarized version of its content.")
+
+# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+# if uploaded_file is not None:
+#     # Extract text from the PDF
+#     st.write("Processing your PDF...")
+#     try:
+#         pdf_text = extract_text_from_pdf(uploaded_file)
+#         st.write("PDF content extracted successfully.")
+
+#         # Display extracted text (optional)
+#         with st.expander("View Extracted Text"):
+#             st.text_area("Extracted Text", pdf_text, height=300)
+
+#         # Summarize the extracted text
+#         if st.button("Summarize"):
+#             st.write("Generating summary...")
+#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
+#             st.subheader("Summary")
+#             st.write(summary[0]["summary_text"])
+#     except Exception as e:
+#         st.error(f"An error occurred while processing the PDF: {str(e)}")
+
+import streamlit as st
+from transformers import pipeline
+import pdfplumber
+
+# Initialize the summarizer
+summarizer = pipeline("summarization", model="t5-small")
+
+def extract_text_from_pdf(pdf_file):
+    """Extract text from an uploaded PDF file using pdfplumber."""
+    try:
+        text = ""
+        with pdfplumber.open(pdf_file) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() + "\n"
+        if not text.strip():
+            raise ValueError("No extractable text found in the PDF.")
+        return text
+    except Exception as e:
+        raise ValueError(f"Error extracting text from PDF: {e}")
+
+def split_text_into_chunks(text, max_chunk_size=1024):
+    """Split the text into smaller chunks for summarization."""
+    chunks = []
+    while len(text) > max_chunk_size:
+        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
+        if split_point == 0:  # No sentence boundary found, split arbitrarily
+            split_point = max_chunk_size
+        chunks.append(text[:split_point])
+        text = text[split_point:]
+    if text:
+        chunks.append(text)
+    return chunks
+
+def summarize_text(chunks):
+    """Summarize each chunk of text with dynamic max_length."""
+    summaries = []
+    for chunk in chunks:
+        input_length = len(chunk.split())  # Approximate token count
+        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
+        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
+        summaries.append(summary[0]["summary_text"])
+    return summaries
+
+# Streamlit Dashboard
+st.title("PDF Summarizer")
+st.write("Upload a PDF file to get a summarized version of its content.")
+
+uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+if uploaded_file is not None:
+    try:
+        # Extract text from the PDF
+        st.write("Processing your PDF...")
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        st.write("PDF content extracted successfully.")
+
+        # Display extracted text (optional)
+        with st.expander("View Extracted Text"):
+            st.text_area("Extracted Text", pdf_text, height=300)
+
+        # Summarize the extracted text
+        if st.button("Summarize"):
+            st.write("Generating summary...")
+            chunks = split_text_into_chunks(pdf_text)
+            summaries = summarize_text(chunks)
+            full_summary = " ".join(summaries)
+            st.subheader("Summary")
+            st.write(full_summary)
+    except Exception as e:
+        st.error(f"An error occurred while processing the PDF: {str(e)}")
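split_text_into_chunks cuts at the last ". " sentence boundary before max_chunk_size and falls back to a hard cut when none is found. Below is a minimal sketch of that behaviour on plain text; the function body is copied from pdfsum.py above so the example runs without importing the module (and therefore without loading the t5-small pipeline):

```python
def split_text_into_chunks(text, max_chunk_size=1024):
    """Copy of the helper from pdfsum.py, for a standalone check."""
    chunks = []
    while len(text) > max_chunk_size:
        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # last full sentence
        if split_point == 0:  # no sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks

sample = "One sentence. " * 40 + "The last one."
print([len(chunk) for chunk in split_text_into_chunks(sample, max_chunk_size=200)])
# Every chunk except possibly the last ends at a sentence boundary and is <= 200 chars.
```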
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
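The diff viewer treats the new requirements.txt as binary, so its contents are not shown. Judging only from the imports across the uploaded modules, a Space like this would need at least the following packages; this is an inference, not the actual file:

```
streamlit
transformers
torch
pypdf
PyPDF2
pdfplumber
xmltodict
requests
```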
textsumm.py
ADDED
@@ -0,0 +1,28 @@
+from transformers import pipeline
+
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ARTICLE = """
+There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
+worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
+struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
+and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
+% of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
+maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
+get impacted until Russia and Ukraine retreat and will end the war.
+The war's impact on global food supply centred on three factors. First is a significant reduction in exports
+and production of essential commodities from both countries, caused by the war and not the economic
+sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
+European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
+meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
+food supplies and inputs needed for agri-food production, which were already at record levels before the
+war. The war has further pushed the prices up. Third factor is the international response to the above,
+which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
+(applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
+Ukraine, have already imposed or announced their intention to impose some control over exports of
+essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
+Hungary. We should keep this in our mind that the long duration of war will make the global situation
+irrecoverable.
+
+"""
+print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
+
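app.py imports `summarizer` from textsumm, and importing the module as written also runs the demo summarization of ARTICLE at the bottom, which loads facebook/bart-large-cnn and summarizes the sample text before the app renders. One way to keep the pipeline importable while making the demo opt-in is sketched below; this is a suggestion under that assumption, not part of the commit:

```python
from transformers import pipeline

# Loaded once at import time so app.py can still do `from textsumm import summarizer`.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

if __name__ == "__main__":
    # Demo only when run directly: `python textsumm.py`
    ARTICLE = ("Russia and Ukraine are key agricultural players, together exporting "
               "nearly 12% of food calories traded globally.")  # shortened sample
    print(summarizer(ARTICLE, max_length=30, min_length=10, do_sample=False))
```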