Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 8

Commit

d5f7d0d

verified ·

1 Parent(s): 84e3794

Fixes EPUB output

Browse files

Files changed (1) hide show

app.py +52 -128

app.py CHANGED Viewed

@@ -1,146 +1,70 @@
 import gradio as gr
-import torch
-import base64
-import fitz  # PyMuPDF
 from io import BytesIO
 from PIL import Image
-from pathlib import Path
-from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts.anchor import get_anchor_text
 from ebooklib import epub
# Load model and processor.
# Pick the inference device first: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor (chat template, tokenizer, image preprocessing) comes from the
# base Qwen2-VL checkpoint; the weights come from the olmOCR preview checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model.eval()  # put the module in eval mode before serving requests
model.to(device)
 def process_pdf_to_epub(pdf_file, title, author):
-    pdf_path = pdf_file.name
-    doc = fitz.open(pdf_path)
-    num_pages = len(doc)
-    # Create EPUB book
     book = epub.EpubBook()
     book.set_identifier("id123456")
-    book.set_title(title)
-    book.add_author(author)
     chapters = []
-    for i in range(num_pages):
-        page_num = i + 1
-        print(f"Processing page {page_num}...")
-        try:
-            # Render page to base64 image
-            image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
-            anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
-            print(f"Anchor text for page {page_num}: {anchor_text}")
-            # New prompt format
-            prompt = (
-                "Below is the image of one page of a document, as well as some raw textual content that was previously "
-                "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
-                "Do not hallucinate.\n"
-                "RAW_TEXT_START\n"
-                f"{anchor_text}\n"
-                "RAW_TEXT_END"
-            )
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                    ],
-                }
-            ]
-            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            image = Image.open(BytesIO(base64.b64decode(image_base64)))
-            inputs = processor(
-                text=[text],
-                images=[image],
-                padding=True,
-                return_tensors="pt",
-            )
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-            output = model.generate(
-                **inputs,
-                temperature=0.8,
-                max_new_tokens=512,
-                num_return_sequences=1,
-                do_sample=True,
-            )
-            prompt_length = inputs["input_ids"].shape[1]
-            new_tokens = output[:, prompt_length:].detach().cpu()
-            decoded = "[No output generated]"
-            if new_tokens is not None and new_tokens.shape[1] > 0:
-                try:
-                    decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-                    decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
-                except Exception as decode_error:
-                    decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
-            else:
-                decoded = "[Model returned no new tokens]"
-        except Exception as processing_error:
-            decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
-        print(f"Decoded content for page {page_num}: {decoded}")
-        # Create chapter
-        chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
-        chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
         book.add_item(chapter)
         chapters.append(chapter)
-        # Save cover image from page 1
-        if page_num == 1:
-            cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
-            cover_io = BytesIO()
-            cover_image.save(cover_io, format='PNG')
-            book.set_cover("cover.png", cover_io.getvalue())
-    # Assemble EPUB
-    book.toc = tuple(chapters)
     book.add_item(epub.EpubNcx())
     book.add_item(epub.EpubNav())
-    book.spine = ['nav'] + chapters
-    output_path = "/tmp/output.epub"
-    epub.write_epub(output_path, book)
-    return output_path
# Gradio Interface: PDF upload + title + author in, one EPUB download out.
_INPUT_COMPONENTS = [
    gr.File(label="Upload PDF", file_types=[".pdf"]),
    gr.Textbox(label="EPUB Title"),
    gr.Textbox(label="Author(s)"),
]
_OUTPUT_COMPONENT = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENT,
    title="PDF to EPUB Converter (with olmOCR)",
    description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Launch settings are collected in one place and passed through unchanged.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "debug": True,
        "allowed_paths": ["/tmp"],
    }
    iface.launch(**launch_options)

 import gradio as gr
+import tempfile
 from io import BytesIO
 from PIL import Image
 from ebooklib import epub
+from olmocr.model import process_pdf  # your forked olmocr model
 def process_pdf_to_epub(pdf_file, title, author):
+    # Run the OCR + LLM pipeline from olmocr
+    print("Starting PDF processing...")
+    page_results = process_pdf(pdf_file.name)
+    # Create the EPUB book
     book = epub.EpubBook()
     book.set_identifier("id123456")
+    book.set_title(title if title else "Untitled Document")
+    book.set_language("en")
+    if author:
+        book.add_author(author)
+    # Try to use the first page as cover
+    try:
+        with Image.open(pdf_file.name) as img:
+            img.convert("RGB").save("cover.jpg", "JPEG")
+            with open("cover.jpg", "rb") as f:
+                cover_data = f.read()
+                book.set_cover("cover.jpg", cover_data)
+    except Exception as e:
+        print("Could not generate cover:", e)
+    # Add chapters from pages
     chapters = []
+    for i, page in enumerate(page_results):
+        text = page.get("decoded", {}).get("natural_text", "")
+        if not text.strip():
+            continue
+        chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang="en")
+        chapter.content = f"<h1>Page {i+1}</h1><p>{text.replace('\n', '<br/>')}</p>"
         book.add_item(chapter)
         chapters.append(chapter)
+    # Define spine and table of contents
+    book.toc = chapters
+    book.spine = ["nav"] + chapters
     book.add_item(epub.EpubNcx())
     book.add_item(epub.EpubNav())
+    # Write to in-memory buffer
+    output_buffer = BytesIO()
+    epub.write_epub(output_buffer, book)
+    output_buffer.seek(0)
+    return output_buffer
# Assemble the UI: PDF upload and optional metadata in, EPUB download out.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")

    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")

    title = gr.Textbox(placeholder="Optional title", label="EPUB Title")
    author = gr.Textbox(placeholder="Optional author name", label="Author")
    convert_button = gr.Button("Convert to EPUB")
    epub_output = gr.File(file_types=[".epub"], label="Download EPUB")

    # Wire the button: (pdf, title, author) -> converter -> download slot.
    convert_button.click(
        process_pdf_to_epub,
        [pdf_input, title, author],
        epub_output,
    )


if __name__ == "__main__":
    demo.launch()