Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 8

Commit

c7e3ff4

verified ·

1 Parent(s): 3658a99

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -11

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from ebooklib import epub
 # Load model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -31,7 +32,7 @@ def process_pdf_to_epub(pdf_file, title, author):
     book.set_title(title)
     book.add_author(author)
-    chapters = []
     for i in range(num_pages):
         page_num = i + 1
@@ -73,7 +74,7 @@ def process_pdf_to_epub(pdf_file, title, author):
             output = model.generate(
                 **inputs,
                 temperature=0.8,
-                max_new_tokens=512,
                 num_return_sequences=1,
                 do_sample=True,
             )
@@ -85,7 +86,12 @@ def process_pdf_to_epub(pdf_file, title, author):
             if new_tokens is not None and new_tokens.shape[1] > 0:
                 try:
                     decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-                    decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
                 except Exception as decode_error:
                     decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
             else:
@@ -95,11 +101,7 @@ def process_pdf_to_epub(pdf_file, title, author):
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         print(f"Decoded content for page {page_num}: {decoded}")
-        chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
-        chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
-        book.add_item(chapter)
-        chapters.append(chapter)
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
@@ -107,12 +109,14 @@ def process_pdf_to_epub(pdf_file, title, author):
             cover_image.save(cover_io, format='PNG')
             book.set_cover("cover.png", cover_io.getvalue())
-    book.toc = tuple(chapters)
     book.add_item(epub.EpubNcx())
     book.add_item(epub.EpubNav())
-    book.spine = ['nav'] + chapters
-    # ✅ SAFELY write to a temp file in /tmp
     with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
         epub.write_epub(tmp.name, book)
         return tmp.name

 from olmocr.prompts.anchor import get_anchor_text
 from ebooklib import epub
+import json
 # Load model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     book.set_title(title)
     book.add_author(author)
+    all_text = ""
     for i in range(num_pages):
         page_num = i + 1
             output = model.generate(
                 **inputs,
                 temperature=0.8,
+                max_new_tokens=5096,
                 num_return_sequences=1,
                 do_sample=True,
             )
             if new_tokens is not None and new_tokens.shape[1] > 0:
                 try:
                     decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+                    raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
+                    try:
+                        parsed = json.loads(raw_output)
+                        decoded = parsed.get("natural_text", raw_output)
+                    except json.JSONDecodeError:
+                        decoded = raw_output
                 except Exception as decode_error:
                     decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
             else:
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         print(f"Decoded content for page {page_num}: {decoded}")
+        all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
             cover_image.save(cover_io, format='PNG')
             book.set_cover("cover.png", cover_io.getvalue())
+    single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
+    single_chapter.content = f"<h1>{title}</h1>{all_text}"
+    book.add_item(single_chapter)
+    book.toc = (single_chapter,)
+    book.spine = ['nav', single_chapter]
     book.add_item(epub.EpubNcx())
     book.add_item(epub.EpubNav())
     with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
         epub.write_epub(tmp.name, book)
         return tmp.name