Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on 19 days ago

Commit

0225b8c

verified ·

1 Parent(s): f99a1ea

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -15

app.py CHANGED Viewed

@@ -11,7 +11,9 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text
-# Set a writable directory for Hugging Face's cache
 os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
 # Load processor and model
@@ -23,13 +25,13 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
-    file_path = file.name
-    image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
-    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
-    anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
     prompt = build_finetuning_prompt(anchor_text)
     messages = [
@@ -43,7 +45,7 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
     ]
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
@@ -58,16 +60,37 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
     prompt_len = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_len:]
     decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
-    # Create EPUB
     book = epub.EpubBook()
     book.set_identifier("id123456")
     book.set_title(title)
     book.set_language(language)
     book.add_author(author)
     chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
-    chapter.content = f"<h1>{title}</h1><p>{decoded}</p>"
     book.add_item(chapter)
     book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
@@ -75,7 +98,7 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
     book.add_item(epub.EpubNCX())
     book.spine = ['nav', chapter]
-    epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
     epub.write_epub(epub_path, book)
     return epub_path
@@ -85,14 +108,13 @@ iface = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF"),
-        gr.Number(value=1, label="Page Number"),
-        gr.Textbox(value="Extracted Page", label="EPUB Title"),
         gr.Textbox(value="olmOCR", label="Author"),
         gr.Textbox(value="en", label="Language"),
     ],
     outputs=gr.File(label="Download EPUB"),
-    title="olmOCR PDF to EPUB",
-    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
     allow_flagging="never"
 )

 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text
+from PyPDF2 import PdfReader
+# Set a writable cache directory for HF
 os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
 # Load processor and model
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+def extract_text_from_page(pdf_path, page_num):
+    # Render image
+    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
+    image = Image.open(BytesIO(base64.b64decode(image_base64)))
+    # Prompt and input
+    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
     prompt = build_finetuning_prompt(anchor_text)
     messages = [
     ]
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
     prompt_len = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_len:]
     decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
+    return decoded, image_base64 if page_num == 1 else None
+def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
+    file_path = file.name
+    reader = PdfReader(file_path)
+    num_pages = len(reader.pages)
+    all_text = []
+    cover_image_data = None
+    for page in range(1, num_pages + 1):
+        text, cover_image = extract_text_from_page(file_path, page)
+        all_text.append(f"<h2>Page {page}</h2><p>{text}</p>")
+        if cover_image and not cover_image_data:
+            cover_image_data = cover_image  # base64
+    # Build EPUB
     book = epub.EpubBook()
     book.set_identifier("id123456")
     book.set_title(title)
     book.set_language(language)
     book.add_author(author)
+    # Add cover image
+    if cover_image_data:
+        cover_bytes = base64.b64decode(cover_image_data)
+        book.set_cover("cover.jpg", cover_bytes)
+    # Create chapter with all text
     chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
+    chapter.content = f"<h1>{title}</h1>{''.join(all_text)}"
     book.add_item(chapter)
     book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
     book.add_item(epub.EpubNCX())
     book.spine = ['nav', chapter]
+    epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
     epub.write_epub(epub_path, book)
     return epub_path
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF"),
+        gr.Textbox(value="Extracted PDF", label="EPUB Title"),
         gr.Textbox(value="olmOCR", label="Author"),
         gr.Textbox(value="en", label="Language"),
     ],
     outputs=gr.File(label="Download EPUB"),
+    title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
+    description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
     allow_flagging="never"
 )