Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 6

Commit

6ba101c

verified ·

1 Parent(s): 5201e8a

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -3

app.py CHANGED Viewed

@@ -36,11 +36,13 @@ def process_pdf_to_epub(pdf_file, title, author):
     for i in range(num_pages):
         page_num = i + 1
         try:
             # Render page to base64 image
             image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
             anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
             prompt = build_finetuning_prompt(anchor_text)
             messages = [
@@ -77,11 +79,17 @@ def process_pdf_to_epub(pdf_file, title, author):
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         else:
             try:
-                decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-                decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
             except Exception as decode_error:
                 decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
         # Create chapter
         chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
         chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
@@ -127,4 +135,3 @@ if __name__ == "__main__":
         debug=True,             # Optional: helpful if you're troubleshooting
         allowed_paths=["/tmp"]  # Optional: makes it explicit that Gradio can write here
     )

     for i in range(num_pages):
         page_num = i + 1
+        print(f"Processing page {page_num}...")  # Debugging line
         try:
             # Render page to base64 image
             image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
             anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
+            print(f"Anchor text for page {page_num}: {anchor_text}")  # Debugging line
             prompt = build_finetuning_prompt(anchor_text)
             messages = [
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         else:
             try:
+                # Check if the tokens are empty
+                if not new_tokens:
+                    decoded = f"[No tokens generated for page {page_num}]"
+                else:
+                    decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+                    decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
             except Exception as decode_error:
                 decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
+        print(f"Decoded content for page {page_num}: {decoded}")  # Debugging line
         # Create chapter
         chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
         chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
         debug=True,             # Optional: helpful if you're troubleshooting
         allowed_paths=["/tmp"]  # Optional: makes it explicit that Gradio can write here
     )