"""DocSum: a Gradio app that extracts and summarizes document content with a VLM.

An uploaded PDF (rendered page by page) or a single image is sent to a
vision-language model through the OpenRouter API. Every page is analyzed with
the user's prompt; when there is more than one page, an overall summary of all
page results is appended.
"""

import base64
import io
import os
import tempfile  # noqa: F401  (no longer used here; kept so the existing import set is unchanged)
from typing import List

import fitz  # PyMuPDF
import gradio as gr
from openai import OpenAI
from PIL import Image

# --- CONFIGURATION ---

OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
MODEL_NAME = "opengvlab/internvl3-14b:free"
DEFAULT_PROMPT = "Extract all content structurally"

SUMMARY_PROMPT_TEMPLATE = """\
You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
Please provide a comprehensive, detailed summary that:
1. Organizes all key information logically
2. Identifies relationships between data points
3. Highlights important figures, dates, names
4. Presents the information in a clear, structured format

Extracted contents from pages:
{extracted_texts}

Comprehensive Summary:
"""

# Styling for the Markdown results panel (matched through elem_classes).
CUSTOM_CSS = """
.markdown-output {
    padding: 20px;
    border-radius: 8px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    max-height: 600px;
    overflow-y: auto;
}
.markdown-output h2 {
    color: #2563eb;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
.markdown-output h3 {
    color: #3b82f6;
    margin-top: 1em;
}
"""


# --- HELPER FUNCTIONS ---

def _make_client(api_key: str) -> OpenAI:
    """Return an OpenAI-compatible client that talks to OpenRouter."""
    return OpenAI(base_url=OPENROUTER_BASE_URL, api_key=api_key)


def _message_text(response) -> str:
    """Return the text of the first choice of a chat completion.

    Raises:
        ValueError: if the response carries no choices, so the caller can
            report something clearer than an indexing error.
    """
    choices = getattr(response, "choices", None)
    if not choices:
        raise ValueError("the model returned no choices")
    return choices[0].message.content or ""


def _encode_png(image: Image.Image) -> bytes:
    """Serialize *image* to PNG and return the encoded bytes."""
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return buffer.getvalue()


def _resolve_upload_path(uploaded_file) -> str:
    """Return the filesystem path of a Gradio upload.

    The upload is accepted either as a plain path (str / PathLike) or as an
    object exposing the path through its ``name`` attribute.
    """
    if isinstance(uploaded_file, (str, os.PathLike)):
        return os.fspath(uploaded_file)
    return uploaded_file.name


def _load_document_images(file_path: str) -> List[Image.Image]:
    """Load the document at *file_path* as a list of PIL images.

    A ``.pdf`` file yields one image per page; any other file is opened as a
    single image.

    Raises:
        gr.Error: if the file cannot be read or decoded.
    """
    if os.path.splitext(file_path)[1].lower() == ".pdf":
        try:
            with open(file_path, "rb") as pdf_handle:
                pdf_data = pdf_handle.read()
        except OSError as e:
            raise gr.Error(f"Error reading PDF: {e}") from e
        return convert_pdf_to_images(pdf_data)  # all pages are analyzed

    try:
        with Image.open(file_path) as opened:
            # copy() forces decoding and detaches the pixels from the file
            # handle, which is closed when the with-block exits.
            return [opened.copy()]
    except OSError as e:
        raise gr.Error(f"Error opening image: {e}") from e


def convert_pdf_to_images(pdf_file: bytes) -> List[Image.Image]:
    """Convert PDF to list of PIL Images.

    Args:
        pdf_file: Raw bytes of the PDF document.

    Returns:
        One RGB image per page, in page order.

    Raises:
        gr.Error: if the PDF cannot be opened or rendered.
    """
    images = []
    try:
        # Open straight from memory: no temporary file to clean up, and the
        # context manager closes the document even when rendering fails.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
            for page in pdf_document:
                # alpha=False keeps the samples 3 bytes per pixel, which is
                # what the "RGB" raw decoder below expects.
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}") from e
    return images


def image_to_base64(image: Image.Image) -> str:
    """Convert PIL Image to base64 string (PNG encoded).

    Images whose mode PNG cannot store (for example CMYK) are converted to
    RGB and encoded again instead of failing.
    """
    try:
        png_bytes = _encode_png(image)
    except OSError:
        # Pillow raises OSError ("cannot write mode ... as PNG") for modes
        # the PNG encoder does not support.
        png_bytes = _encode_png(image.convert("RGB"))
    return base64.b64encode(png_bytes).decode("utf-8")


def generate_summary(extracted_texts: str, api_key: str) -> str:
    """Generate a comprehensive summary of all extracted texts.

    Args:
        extracted_texts: The per-page results joined into one string.
        api_key: OpenRouter API key.

    Returns:
        The summary text produced by the model.

    Raises:
        gr.Error: if the request fails or the model returns nothing.
    """
    try:
        client = _make_client(api_key)
        summary_prompt = SUMMARY_PROMPT_TEMPLATE.format(
            extracted_texts=extracted_texts
        )
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": "You are Dalton, an expert in analyzing and summarizing document contents.",
                },
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=2048,
        )
        return _message_text(response)
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}") from e


def analyze_document(api_key: str, user_prompt: str, uploaded_file) -> str:
    """Main processing function.

    Args:
        api_key: OpenRouter API key entered by the user.
        user_prompt: Instruction sent along with every page image; a blank
            prompt falls back to ``DEFAULT_PROMPT``.
        uploaded_file: The upload from ``gr.File`` (a path or an object with
            a ``name`` attribute holding the path).

    Returns:
        Markdown text with one section per page, followed by a comprehensive
        summary when the document has more than one page.

    Raises:
        gr.Error: on missing input, unreadable documents, or API failures.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")
    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    prompt = (user_prompt or "").strip() or DEFAULT_PROMPT

    images_to_analyze = _load_document_images(_resolve_upload_path(uploaded_file))
    if not images_to_analyze:
        raise gr.Error("The uploaded document contains no pages")

    # One client serves every page request.
    client = _make_client(api_key)

    all_results = []
    extracted_texts = []
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions.",
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_base64}"
                                },
                            },
                        ],
                    },
                ],
                max_tokens=1024,
            )
            result = _message_text(response)
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}") from e

        extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
        # Blank lines around the result keep the trailing "---" a horizontal
        # rule; directly under text, Markdown reads it as a heading underline.
        all_results.append(f"📄 Page {idx} Result:\n\n{result}\n\n---\n")

    # Generate summary if multiple pages
    final_output = "\n".join(all_results)
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        final_output += f"\n📝 Comprehensive Summary:\n\n{summary}"

    return final_output


# --- GRADIO INTERFACE ---

with gr.Blocks(
    title="DocSum - Document Summarizer",
    theme=gr.themes.Soft(),
    css=CUSTOM_CSS,  # passed at construction, the documented way to attach custom CSS
) as demo:
    gr.Markdown("# 🧾 DocSum")
    gr.Markdown(
        "Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)"
    )

    with gr.Row():
        api_key = gr.Textbox(
            label="🔑 OpenRouter API Key",
            type="password",
            placeholder="Enter your OpenRouter API key",
        )
        user_prompt = gr.Textbox(
            label="📝 Enter Your Prompt",
            value=DEFAULT_PROMPT,
            placeholder="What would you like to extract?",
        )

    uploaded_file = gr.File(
        label="Upload Document (PDF/Image)",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
    )

    submit_btn = gr.Button("🔍 Analyze Document", variant="primary")

    # Results are rendered as Markdown rather than in a plain textbox.
    output = gr.Markdown(
        label="Analysis Results",
        elem_classes=["markdown-output"],
    )

    submit_btn.click(
        fn=analyze_document,
        inputs=[api_key, user_prompt, uploaded_file],
        outputs=output,
    )


if __name__ == "__main__":
    demo.launch()