DeepSoft-Tech's picture
Update app.py
a2f575f verified
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1XblbxoRxB4XOHixjGij789FPD9KjKdhi
"""
import os
import pdfplumber
import gradio as gr
from langchain_groq.chat_models import ChatGroq
# Read the Groq API key from the environment and refuse to start without it.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if GROQ_API_KEY in (None, ""):
    raise ValueError("GROQ_API_KEY is not set. Add it in Hugging Face Secrets.")

# Single chat-model client shared by every summarization request.
# NOTE(review): the key is not passed explicitly; presumably ChatGroq reads
# GROQ_API_KEY from the environment itself -- confirm against langchain_groq.
llm = ChatGroq(model_name="llama-3.3-70b-versatile")
def extract_text_from_pdf(pdf_file):
    """Return the text of a text-based PDF, or a readable message on failure.

    Every page's text is stripped, pages are separated by a blank line, and
    pages for which ``extract_text()`` yields nothing are skipped.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (this file passes a path).

    Returns:
        The extracted text with surrounding whitespace removed. If any
        exception is raised while reading, a string starting with
        ``"Error extracting text: "`` is returned instead; if no text at all
        is found, a ``"⚠️ ..."`` warning string is returned. Callers are
        expected to tell these apart by their prefix.
    """
    try:
        with pdfplumber.open(pdf_file) as document:
            raw_pages = (page.extract_text() for page in document.pages)
            # Consume the generator while the PDF is still open.
            chunks = [raw.strip() for raw in raw_pages if raw]
    except Exception as exc:
        # Failures are reported as text rather than raised, so the message
        # can be shown to the user as-is.
        return f"Error extracting text: {exc!s}"

    combined = "\n\n".join(chunks).strip()
    if combined:
        return combined
    return "⚠️ No readable text found. This might be a scanned or image-based PDF."
def summarize_text(text, length, style, max_chars=10_000):
    """Ask the module-level ``llm`` for a structured summary of ``text``.

    Args:
        text: Document text to summarize.
        length: Desired summary length label (e.g. "Short"); lower-cased
            before being placed in the prompt.
        style: Desired summary style label (e.g. "Bullets"); lower-cased
            before being placed in the prompt.
        max_chars: Maximum number of leading characters of ``text`` sent to
            the model. ``None`` disables truncation.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    # Truncate outside the prompt template: a "# ..." remark written inside
    # the f-string is not a comment, it becomes part of the text sent to the
    # model and pollutes the document.
    excerpt = text[:max_chars]

    prompt = (
        f"Read the following document and summarize it in {style.lower()} format.\n"
        f"Keep the summary {length.lower()}.\n"
        "Follow this structured reasoning:\n"
        "1. Identify key sections & main topics.\n"
        "2. Extract essential points from each section.\n"
        "3. Remove redundant information.\n"
        "4. Ensure accuracy without hallucination.\n"
        "Document:\n"
        f"{excerpt}\n"
    )

    # `invoke` replaces the deprecated `predict`; it returns a message object
    # whose `.content` holds the reply.
    # NOTE(review): assumes `.content` is a plain string for this model -- confirm.
    response = llm.invoke(prompt)
    return response.content.strip()
def process_pdf(file, length, style):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string; falsy when nothing was uploaded.
        length: Summary length label forwarded to ``summarize_text``.
        style: Summary style label forwarded to ``summarize_text``.

    Returns:
        The summary, or a user-facing message when no file was uploaded or
        text extraction failed.
    """
    if not file:
        return "⚠️ No file uploaded. Please upload a PDF."

    # Accept a bare path string as well as an object carrying `.name`.
    pdf_path = getattr(file, "name", file)
    text = extract_text_from_pdf(pdf_path)

    # Match only the exact prefixes extract_text_from_pdf produces. Checking
    # for a bare "Error" would misclassify real documents whose text happens
    # to start with that word (e.g. "Errors and Omissions ...") as failures.
    failure_prefixes = ("⚠️", "Error extracting text:")
    if text.startswith(failure_prefixes):
        return text  # Surface the extraction message directly.

    return summarize_text(text, length, style)
# Create Gradio Interface: one file upload plus two radio groups feed
# process_pdf, and its return value is rendered as plain text.
# The emoji in the labels/title were mis-encoded (mojibake) and are restored.
interface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="📄 Upload a PDF"),
        # NOTE(review): the original glyph here lost a byte; the ruler emoji
        # is the most plausible match for "Summary Length" -- confirm.
        gr.Radio(["Short", "Medium", "Long"], label="📏 Summary Length", value="Medium"),
        gr.Radio(
            ["Bullets", "Key Takeaways", "Concise Paragraph"],
            label="📌 Summary Style",
            value="Key Takeaways",
        ),
    ],
    outputs="text",
    title="📄 Insurance Policy Document Summarizer",
    description="Upload a policy and get summary.",
)

# Run the app
interface.launch()