Tulika2000 committed on
Commit
dcc2356
·
verified ·
1 Parent(s): 8ba0424

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1XblbxoRxB4XOHixjGij789FPD9KjKdhi
8
+ """
9
+
10
+ import os
11
+ import pdfplumber
12
+ import gradio as gr
13
+ from langchain_groq.chat_models import ChatGroq
14
+
15
# Read the Groq API key from the environment (e.g. a Hugging Face Space secret).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast at import time so a missing secret is reported with a clear message.
    raise ValueError("GROQ_API_KEY is not set. Add it in Hugging Face Secrets.")

# Initialize the chat model (Mixtral-8x7B via Groq). The key validated above is
# passed explicitly so the client uses exactly the value that was checked.
# NOTE(review): confirm this model id is still offered by Groq.
llm = ChatGroq(model_name="mixtral-8x7b-32768", groq_api_key=GROQ_API_KEY)
22
+
23
def extract_text_from_pdf(pdf_file):
    """Return the text of a text-based PDF, or a human-readable message on failure.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the caller passes a path).

    Returns:
        The stripped text of all pages that yielded text, separated by blank
        lines. If opening or reading fails, a string starting with
        ``"Error extracting text: "`` is returned instead; if no page yields
        any text, a warning string starting with ``"⚠️"`` is returned.
    """
    page_chunks = []
    try:
        with pdfplumber.open(pdf_file) as document:
            extracted = (page.extract_text() for page in document.pages)
            # Pages with no extractable text (None or "") are skipped.
            page_chunks = [content.strip() for content in extracted if content]
    except Exception as exc:
        return f"Error extracting text: {str(exc)}"

    # One blank line between pages; outer whitespace trimmed.
    combined = "\n\n".join(page_chunks).strip()
    if combined:
        return combined

    return "⚠️ No readable text found. This might be a scanned or image-based PDF."
39
+
40
def summarize_text(text, length, style, max_chars=10000):
    """Summarize ``text`` with the module-level ``llm`` chat model.

    Args:
        text: Document text to summarize.
        length: Desired summary length label (e.g. "Short"); lower-cased into the prompt.
        style: Desired summary style label (e.g. "Bullets"); lower-cased into the prompt.
        max_chars: Number of leading characters of ``text`` included in the
            prompt. Defaults to 10,000.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Truncate outside the prompt template: a "#" note placed inside the
    # f-string is literal text and would be sent to the model with the document.
    excerpt = text[:max_chars]
    prompt = (
        f"Read the following document and summarize it in {style.lower()} format.\n"
        f"Keep the summary {length.lower()}.\n"
        "Follow this structured reasoning:\n"
        "1. Identify key sections & main topics.\n"
        "2. Extract essential points from each section.\n"
        "3. Remove redundant information.\n"
        "4. Ensure accuracy without hallucination.\n"
        "\n"
        "Document:\n"
        f"{excerpt}\n"
    )
    # ``invoke`` replaces the deprecated ``predict``; the reply text is on ``.content``.
    response = llm.invoke(prompt)
    return response.content.strip()
58
+
59
def process_pdf(file, length, style):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        file: The upload from the Gradio file input; either an object exposing
            the path as ``.name`` or a plain path string. Falsy means nothing
            was uploaded.
        length: Summary length label forwarded to ``summarize_text``.
        style: Summary style label forwarded to ``summarize_text``.

    Returns:
        The summary text, or a warning/error message string when no file was
        uploaded or text extraction did not succeed.
    """
    if not file:
        return "⚠️ No file uploaded. Please upload a PDF."

    # Accept both a tempfile-like object (path in ``.name``) and a bare path string.
    pdf_path = getattr(file, "name", file)
    text = extract_text_from_pdf(pdf_path)

    # Match the exact message prefixes produced by extract_text_from_pdf rather
    # than any text starting with "Error"/"⚠️", so a document that merely
    # begins with the word "Error" is still summarized.
    if text.startswith(("⚠️ No readable text found", "Error extracting text:")):
        return text  # Return extraction messages directly

    return summarize_text(text, length, style)
69
+
70
# Build the Gradio UI: a PDF upload plus two radio selectors, wired to process_pdf.
interface = gr.Interface(
    fn=process_pdf,
    inputs=[
        # Restrict the upload widget to PDFs, the only format the app handles.
        gr.File(label="📄 Upload a PDF", file_types=[".pdf"]),
        gr.Radio(["Short", "Medium", "Long"], label="📏 Summary Length", value="Medium"),
        gr.Radio(
            ["Bullets", "Key Takeaways", "Concise Paragraph"],
            label="📌 Summary Style",
            value="Key Takeaways",
        ),
    ],
    outputs="text",
    title="📄 PDF Summarizer (Text-Based PDFs Only)",
    description="Upload a PDF file (text-based only) and get a structured summary. Not for scanned/image PDFs.",
)

# Run the app
interface.launch()