leonarb committed · verified · Commit f01e8a4 · Parent: c986ff1

Update app.py

Files changed (1): app.py (+35, −18)
app.py CHANGED
@@ -1,19 +1,19 @@
 import os
-
-# Set a writable directory for Hugging Face's cache
-os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
-
 import torch
 import base64
 from io import BytesIO
 from PIL import Image
 import gradio as gr
+from ebooklib import epub

 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text

+# Set a writable directory for Hugging Face's cache
+os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
+
 # Load processor and model
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -23,19 +23,15 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)

-def process_pdf(file, page=1):
-    # Save uploaded file to disk
+def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
     file_path = file.name

-    # Render the selected PDF page to base64 PNG
     image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
     main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

-    # Extract document metadata and build the prompt
     anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
     prompt = build_finetuning_prompt(anchor_text)

-    # Construct chat message
     messages = [
         {
             "role": "user",
@@ -46,12 +42,10 @@ def process_pdf(file, page=1):
         }
     ]

-    # Tokenize inputs
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}

-    # Run model
     with torch.no_grad():
         output = model.generate(
             **inputs,
@@ -61,22 +55,45 @@ def process_pdf(file, page=1):
             do_sample=True,
         )

-    # Decode
     prompt_len = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_len:]
-    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-    return decoded[0]
+    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
+
+    # Create EPUB
+    book = epub.EpubBook()
+    book.set_identifier("id123456")
+    book.set_title(title)
+    book.set_language(language)
+    book.add_author(author)
+
+    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
+    chapter.content = f"<h1>{title}</h1><p>{decoded}</p>"
+    book.add_item(chapter)
+
+    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
+    book.add_item(epub.EpubNav())
+    book.add_item(epub.EpubNcx())
+    book.spine = ['nav', chapter]
+
+    epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
+    epub.write_epub(epub_path, book)
+
+    return epub_path

-# Gradio interface
+# Gradio Interface
 iface = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF"),
         gr.Number(value=1, label="Page Number"),
+        gr.Textbox(value="Extracted Page", label="EPUB Title"),
+        gr.Textbox(value="olmOCR", label="Author"),
+        gr.Textbox(value="en", label="Language"),
     ],
-    outputs="text",
-    title="olmOCR PDF Text Extractor",
-    description="Upload a PDF and select a page to extract text using the olmOCR model.",
+    outputs=gr.File(label="Download EPUB"),
+    title="olmOCR PDF to EPUB",
+    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
+    allow_flagging="never"
 )

 if __name__ == "__main__":
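
As a sanity check on the new export path, the EPUB assembly inside process_pdf can be exercised on its own, without the model or Gradio. Below is a minimal sketch using only ebooklib; the identifier, title, body text, and output path are placeholder values, not taken from the commit. Note the navigation classes are spelled EpubNcx and EpubNav (the commit originally wrote EpubNCX and EpubNavi, which do not exist in ebooklib and are corrected in the diff above).

from ebooklib import epub

# Assemble a one-chapter EPUB mirroring the structure process_pdf builds.
book = epub.EpubBook()
book.set_identifier("sample-id")   # placeholder identifier
book.set_title("Extracted Page")
book.set_language("en")
book.add_author("olmOCR")

# One XHTML chapter; in the app, the decoded model output goes here.
chapter = epub.EpubHtml(title="Extracted Page", file_name="chap1.xhtml", lang="en")
chapter.content = "<h1>Extracted Page</h1><p>Sample extracted text.</p>"
book.add_item(chapter)

# Table of contents plus the NCX and nav documents EPUB readers expect.
book.toc = (epub.Link("chap1.xhtml", "Extracted Page", "chap1"),)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

# Reading order: the nav page first, then the chapter.
book.spine = ["nav", chapter]

epub.write_epub("/tmp/sample.epub", book)

Opening the result in any EPUB reader shows a single chapter; in the Space, decoded replaces the sample text and the returned path feeds the gr.File output for download.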
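
Once the Space is running, the updated endpoint can also be called programmatically. A sketch with a recent gradio_client, assuming the default /predict endpoint that gr.Interface exposes; the Space id and PDF path are hypothetical stand-ins.

from gradio_client import Client, handle_file

# Hypothetical Space id; substitute the actual one.
client = Client("leonarb/olmocr-pdf-to-epub")

# Arguments follow the Interface inputs: file, page, title, author, language.
epub_path = client.predict(
    handle_file("document.pdf"),
    2,
    "My Extracted Page",
    "olmOCR",
    "en",
    api_name="/predict",
)
print(epub_path)  # local path of the downloaded EPUB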