import gradio as gr from transformers import AutoProcessor, AutoModelForVision2Seq from pdf2image import convert_from_path import base64 import io import spaces from PIL import Image # Load the OCR model and processor from Hugging Face try: processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview") model = AutoModelForVision2Seq.from_pretrained("allenai/olmOCR-7B-0225-preview") except ImportError as e: processor = None model = None print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.") except ValueError as e: processor = None model = None print(f"Error with model configuration: {str(e)}") @spaces.GPU def process_pdf(pdf_file): """ Process the uploaded PDF file, extract text from each page, and generate HTML to display each page's image and text with copy buttons. """ if processor is None or model is None: return "

Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.

" # Check if a PDF file was uploaded if pdf_file is None: return "

Please upload a PDF file.

" # Convert PDF to images try: pages = convert_from_path(pdf_file.name) except Exception as e: return f"

Error converting PDF to images: {str(e)}

" # Start building the HTML output html = '

' # Process each page for i, page in enumerate(pages): # Convert the page image to base64 for embedding in HTML buffered = io.BytesIO() page.save(buffered, format="PNG") img_str = base64.b64encode(buffered.getvalue()).decode() img_data = f"data:image/png;base64,{img_str}" # Extract text from the page using the OCR model try: inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt") outputs = model.generate(**inputs) text = processor.decode(outputs[0], skip_special_tokens=True) except Exception as e: text = f"Error extracting text: {str(e)}" # Generate HTML for this page's section textarea_id = f"text{i+1}" html += f'''

Page {i+1}

''' # Close the pages div and add JavaScript for copy functionality html += '

' html += ''' ''' return html # Define the Gradio interface with gr.Blocks(title="PDF Text Extractor") as demo: gr.Markdown("# PDF Text Extractor") gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text.") with gr.Row(): pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) submit_btn = gr.Button("Extract Text") output_html = gr.HTML() submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html) # Launch the interface demo.launch()