leonarb committed on
Commit
b3d319d
·
verified ·
1 Parent(s): 0b3222e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -32
app.py CHANGED
@@ -15,23 +15,23 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
15
  cache_dir = "/tmp/huggingface_cache"
16
  os.environ["HF_HOME"] = cache_dir
17
  os.environ["TORCH_HOME"] = cache_dir
18
- os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
19
  os.makedirs(cache_dir, exist_ok=True)
20
 
21
- # Patch logging path before olmocr import
22
  import logging
23
- original_file_handler = logging.FileHandler
24
- def safe_file_handler(filename, *args, **kwargs):
25
- if filename == "olmocr-pipeline-debug.log":
26
- filename = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
27
- return original_file_handler(filename, *args, **kwargs)
28
- logging.FileHandler = safe_file_handler
29
-
30
- # Import olmocr pipeline after setting log path
31
- from olmocr.pipeline import PDFToTextOCR
32
- from olmocr.data.renderpdf import render_pdf_to_base64png
33
  from olmocr.prompts import build_finetuning_prompt
34
  from olmocr.prompts.anchor import get_anchor_text
 
35
 
36
  # Load model and processor
37
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -41,9 +41,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
41
  ).eval().to(device)
42
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
43
 
44
- # Load OCR pipeline
45
- olmocr = PDFToTextOCR(model=model, processor=processor)
46
-
47
  def ocr_page(pdf_path, page_num):
48
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
49
  anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
@@ -77,46 +74,36 @@ def ocr_page(pdf_path, page_num):
77
 
78
  def create_epub_from_text(text, output_path, title, author, language, cover_image):
79
  book = epub.EpubBook()
80
-
81
- # Set metadata
82
  book.set_title(title)
83
  book.set_language(language)
84
  book.add_author(author)
85
 
86
- # Add cover image
87
  with open(cover_image, "rb") as cover_file:
88
  cover_data = cover_file.read()
89
  cover_item = epub.EpubItem(uid="cover", file_name="cover.jpg", media_type="image/jpeg", content=cover_data)
90
  book.add_item(cover_item)
91
 
92
- # Create a chapter for the content
93
  chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
94
  chapter.set_content(f"<html><body><h1>{title}</h1><p>{text}</p></body></html>")
95
  book.add_item(chapter)
96
-
97
- # Define Table of Contents (TOC)
98
  book.toc = (epub.Link("content.xhtml", "Content", "content"),)
99
-
100
- # Add default NCX and OPF files
101
  book.add_item(epub.EpubNav())
102
-
103
- # Write the EPUB file
104
  epub.write_epub(output_path, book)
105
 
106
  def convert_pdf_to_epub(pdf_file, title, author, language):
107
  tmp_pdf_path = pdf_file.name
108
-
109
- # Read PDF to get cover
110
  reader = PdfReader(tmp_pdf_path)
111
- first_page = reader.pages[0]
112
  cover_path = "/tmp/cover.jpg"
113
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
114
  images[0].save(cover_path, "JPEG")
115
 
116
- # Run OCR
117
- ocr_text = olmocr.process(tmp_pdf_path)
 
 
 
 
118
 
119
- # Write EPUB
120
  epub_path = "/tmp/output.epub"
121
  create_epub_from_text(
122
  text=ocr_text,
@@ -126,7 +113,6 @@ def convert_pdf_to_epub(pdf_file, title, author, language):
126
  language=language,
127
  cover_image=cover_path
128
  )
129
-
130
  return epub_path, cover_path
131
 
132
  def interface_fn(pdf, title, author, language):
 
15
  cache_dir = "/tmp/huggingface_cache"
16
  os.environ["HF_HOME"] = cache_dir
17
  os.environ["TORCH_HOME"] = cache_dir
 
18
  os.makedirs(cache_dir, exist_ok=True)
19
 
20
+ # Patch logging to avoid permission errors
21
  import logging
22
+ from logging import FileHandler
23
+ class SafeFileHandler(FileHandler):
24
+ def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
25
+ # Ignore the requested filename: always log to OLMOCR_LOG_PATH (default /tmp/olmocr-pipeline-debug.log)
26
+ safe_path = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
27
+ super().__init__(safe_path, mode, encoding, delay, errors)
28
+ logging.FileHandler = SafeFileHandler
29
+
30
+ # Import olmocr only after logging.FileHandler has been replaced above
31
+ from olmocr.run_ocr import ocr_pdf_to_text
32
  from olmocr.prompts import build_finetuning_prompt
33
  from olmocr.prompts.anchor import get_anchor_text
34
+ from olmocr.data.renderpdf import render_pdf_to_base64png
35
 
36
  # Load model and processor
37
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
41
  ).eval().to(device)
42
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
43
 
 
 
 
44
  def ocr_page(pdf_path, page_num):
45
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
46
  anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
 
74
 
75
  def create_epub_from_text(text, output_path, title, author, language, cover_image):
76
  book = epub.EpubBook()
 
 
77
  book.set_title(title)
78
  book.set_language(language)
79
  book.add_author(author)
80
 
 
81
  with open(cover_image, "rb") as cover_file:
82
  cover_data = cover_file.read()
83
  cover_item = epub.EpubItem(uid="cover", file_name="cover.jpg", media_type="image/jpeg", content=cover_data)
84
  book.add_item(cover_item)
85
 
 
86
  chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
87
  chapter.set_content(f"<html><body><h1>{title}</h1><p>{text}</p></body></html>")
88
  book.add_item(chapter)
 
 
89
  book.toc = (epub.Link("content.xhtml", "Content", "content"),)
 
 
90
  book.add_item(epub.EpubNav())
 
 
91
  epub.write_epub(output_path, book)
92
 
93
  def convert_pdf_to_epub(pdf_file, title, author, language):
94
  tmp_pdf_path = pdf_file.name
 
 
95
  reader = PdfReader(tmp_pdf_path)
 
96
  cover_path = "/tmp/cover.jpg"
97
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
98
  images[0].save(cover_path, "JPEG")
99
 
100
+ # Use official AllenAI OCR function
101
+ ocr_text = ocr_pdf_to_text(
102
+ pdf_path=tmp_pdf_path,
103
+ model=model,
104
+ processor=processor
105
+ )
106
 
 
107
  epub_path = "/tmp/output.epub"
108
  create_epub_from_text(
109
  text=ocr_text,
 
113
  language=language,
114
  cover_image=cover_path
115
  )
 
116
  return epub_path, cover_path
117
 
118
  def interface_fn(pdf, title, author, language):