# DOCSUM / app.py
# adil9858's picture
# Update app.py
# 80e430b verified
# raw
# history blame
# 8.99 kB
import gradio as gr
from openai import OpenAI
import base64
from PIL import Image
import io
import fitz # PyMuPDF
import tempfile
import os
# --- HELPER FUNCTIONS ---
def convert_pdf_to_images(pdf_file):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Raw bytes of the PDF document.

    Returns:
        A list with one RGB ``PIL.Image`` per page, in page order.

    Raises:
        gr.Error: If the bytes cannot be written to disk, opened as a PDF,
            or rendered.
    """
    images = []
    tmp_file_path = None
    try:
        # The document is opened from a path, so spill the bytes to a
        # temporary file first. delete=False keeps it alive after the
        # ``with`` block; it is removed explicitly in ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(pdf_file)

        pdf_document = fitz.open(tmp_file_path)
        try:
            for page in pdf_document:
                # alpha=False yields 3 bytes per pixel, matching the "RGB" mode below.
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
        finally:
            # Close even when rendering fails so the file handle is released.
            pdf_document.close()
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}") from e
    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the real result or the real error.
                pass
    return images
def image_to_base64(image):
    """Encode a PIL image as a base64 string of its PNG representation.

    Args:
        image: A ``PIL.Image`` (anything exposing ``mode``, ``convert`` and
            ``save`` in the same way).

    Returns:
        The PNG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    # The PNG writer only accepts these Pillow modes. Others (for example
    # CMYK, which JPEG uploads can use) would make ``save`` raise, so they
    # are converted to RGB first.
    png_modes = ("1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA")
    if image.mode not in png_modes:
        image = image.convert("RGB")

    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
def generate_summary(extracted_texts, api_key):
    """Ask the model for one consolidated summary of the extracted page texts.

    Args:
        extracted_texts: Extracted page contents; interpolated verbatim into
            the prompt.
        api_key: OpenRouter API key used for the request.

    Returns:
        The message content of the first completion choice.

    Raises:
        gr.Error: If creating the client, building the prompt, or the
            request itself fails.
    """
    try:
        openrouter = OpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1",
        )
        # Joined with newlines; the empty first and last entries give the
        # prompt its leading and trailing newline.
        prompt_lines = [
            "",
            "You are an expert document analyst. Below are the extracted contents from multiple pages of a document.",
            "Please provide a comprehensive, detailed summary that:",
            "1. Organizes all key information logically",
            "2. Identifies relationships between data points",
            "3. Highlights important figures, dates, names",
            "4. Presents the information in a clear, structured format",
            "Extracted contents from pages:",
            f"{extracted_texts}",
            "Comprehensive Summary:",
            "",
        ]
        conversation = [
            {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
            {"role": "user", "content": "\n".join(prompt_lines)},
        ]
        completion = openrouter.chat.completions.create(
            max_tokens=2048,
            messages=conversation,
            model="opengvlab/internvl3-14b:free",
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        raise gr.Error(f"Error generating summary: {exc}")
def analyze_document(api_key, user_prompt, uploaded_file):
    """Analyze an uploaded PDF or image page by page and return Markdown.

    Each page image is sent to the vision model together with ``user_prompt``.
    When more than one page was analyzed, a combined summary is appended.

    Args:
        api_key: OpenRouter API key; must be non-empty.
        user_prompt: Instruction sent alongside every page image.
        uploaded_file: The upload, either a path string or an object exposing
            the path as ``.name``.

    Returns:
        A Markdown string with the per-page results, the optional summary,
        a static "Key Data Extracted" section and a footer naming the file.

    Raises:
        gr.Error: If the key or the file is missing, the image cannot be
            opened, the client cannot be created, or a page fails to analyze.
    """
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")
    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    # Accept a plain path string as well as an object carrying the path in
    # ``.name`` (the only form the previous implementation handled).
    file_path = getattr(uploaded_file, "name", uploaded_file)
    file_ext = os.path.splitext(file_path)[1].lower()

    # Handle PDF or image
    if file_ext == ".pdf":
        with open(file_path, "rb") as f:
            pdf_data = f.read()
        images_to_analyze = convert_pdf_to_images(pdf_data)  # all pages
    else:
        try:
            # copy() detaches the pixel data so the file handle can be closed.
            with Image.open(file_path) as opened:
                images_to_analyze = [opened.copy()]
        except Exception as e:
            raise gr.Error(f"Error opening image: {e}") from e

    # One client serves every page; it does not depend on the loop variable.
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
    except Exception as e:
        raise gr.Error(f"Error creating API client: {e}") from e

    # Process each image
    all_results = []
    extracted_texts = []
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model="opengvlab/internvl3-14b:free",
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }},
                    ]},
                ],
                max_tokens=1024,
            )
            result = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}") from e
        extracted_texts.append(f"### Page {idx}\n{result}\n")
        all_results.append(f"## 📄 Page {idx} Results\n{result}\n---\n")

    sections = ["\n".join(all_results)]

    # Generate summary if multiple pages
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        sections.append(f"\n## 📝 Comprehensive Summary\n{summary}\n")

    # Structured data section. NOTE: these bullet values are fixed
    # placeholder text, not data derived from the document.
    sections.append("\n## 🔍 Key Data Extracted\n")
    sections.append("- **Important Figures**: [Extracted values]\n")
    sections.append("- **Critical Dates**: [Extracted dates]\n")
    sections.append("- **Main Entities**: [Identified names/companies]\n")
    sections.append("- **Action Items**: [Key tasks identified]\n")

    # Add document metadata
    sections.append(f"\n---\n*Document processed: {file_path}*")
    return "".join(sections)
# Custom CSS for the dark theme with green text.
# The stylesheet declares colour variables on :root, then styles the page
# body, the `.markdown-output` container (headings, links, code, lists; it
# is capped at 600px height and scrolls vertically) and all buttons.
# It is handed to gr.Blocks through its `css` argument.
custom_css = """
:root {
--primary: #00ff00;
--primary-50: #00ff0033;
--primary-100: #00ff0066;
--primary-200: #00ff0099;
--primary-300: #00ff00cc;
--secondary: #00cc00;
--secondary-50: #00cc0033;
--secondary-100: #00cc0066;
--secondary-200: #00cc0099;
--secondary-300: #00cc00cc;
--color-background-primary: #000000;
--color-background-secondary: #111111;
--color-background-tertiary: #222222;
--text-color: #00ff00;
--block-background-fill: #111111;
--block-border-color: #00aa00;
--block-label-text-color: #00ff00;
--block-title-text-color: #00ff00;
--input-background-fill: #111111;
--input-border-color: #00aa00;
--input-text-color: #00ff00;
}
body {
background-color: var(--color-background-primary) !important;
color: var(--text-color) !important;
}
.markdown-output {
padding: 20px;
border-radius: 8px;
background: var(--color-background-secondary);
border: 1px solid var(--block-border-color);
max-height: 600px;
overflow-y: auto;
color: var(--text-color) !important;
}
.markdown-output h1,
.markdown-output h2,
.markdown-output h3 {
color: var(--primary) !important;
border-bottom: 1px solid var(--primary-300);
}
.markdown-output a {
color: var(--secondary) !important;
}
.markdown-output code {
background-color: var(--color-background-tertiary);
color: var(--secondary);
}
.markdown-output pre {
background-color: var(--color-background-tertiary) !important;
border: 1px solid var(--block-border-color);
}
.markdown-output ul,
.markdown-output ol {
color: var(--text-color);
}
button {
background: var(--primary) !important;
color: black !important;
font-weight: bold !important;
}
button:hover {
background: var(--primary-300) !important;
}
"""
# Dark theme: start from Gradio's default theme with every hue set to green,
# then override individual colours for a black background with green text.
dark_green_theme = gr.themes.Default(
    neutral_hue="green",
    primary_hue="green",
    secondary_hue="green",
)
dark_green_theme = dark_green_theme.set(
    # Backgrounds
    background_fill_primary="#000000",
    background_fill_secondary="#111111",
    block_background_fill="#111111",
    # Borders
    border_color_accent="#00aa00",
    # Text
    body_text_color="#00ff00",
    block_label_text_color="#00ff00",
    button_primary_text_color="#000000",
)
# --- GRADIO INTERFACE ---
# Layout: title, a row with the API key and prompt inputs, the file upload,
# the submit button and the Markdown result area.
# NOTE(review): the original nesting was lost; this assumes the row holds
# only the two textboxes - confirm against the deployed layout.
with gr.Blocks(
    title="DocSum - Document Summarizer",
    theme=dark_green_theme,
    css=custom_css,
) as demo:
    gr.Markdown("# 🧾 DocSum")
    gr.Markdown("Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")

    with gr.Row():
        api_key = gr.Textbox(
            label="🔑 OpenRouter API Key",
            type="password",  # keep the key masked on screen
            placeholder="Enter your OpenRouter API key",
        )
        user_prompt = gr.Textbox(
            label="📝 Enter Your Prompt",
            value="Extract all content structurally",
            placeholder="What would you like to extract?",
        )

    uploaded_file = gr.File(
        label="Upload Document (PDF/Image)",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
    )
    submit_btn = gr.Button("🔍 Analyze Document", variant="primary")

    # Markdown output; the class name is the one targeted by custom_css.
    output = gr.Markdown(
        label="Analysis Results",
        elem_classes=["markdown-output"],
    )

    # Run the analysis on click and render its Markdown result.
    submit_btn.click(
        fn=analyze_document,
        inputs=[api_key, user_prompt, uploaded_file],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()