Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,23 +15,23 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
|
|
15 |
cache_dir = "/tmp/huggingface_cache"
|
16 |
os.environ["HF_HOME"] = cache_dir
|
17 |
os.environ["TORCH_HOME"] = cache_dir
|
18 |
-
os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
|
19 |
os.makedirs(cache_dir, exist_ok=True)
|
20 |
|
21 |
-
# Patch logging
|
22 |
import logging
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
from olmocr.
|
33 |
from olmocr.prompts import build_finetuning_prompt
|
34 |
from olmocr.prompts.anchor import get_anchor_text
|
|
|
35 |
|
36 |
# Load model and processor
|
37 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
@@ -41,9 +41,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
41 |
).eval().to(device)
|
42 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
43 |
|
44 |
-
# Load OCR pipeline
|
45 |
-
olmocr = PDFToTextOCR(model=model, processor=processor)
|
46 |
-
|
47 |
def ocr_page(pdf_path, page_num):
|
48 |
image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
|
49 |
anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
|
@@ -77,46 +74,36 @@ def ocr_page(pdf_path, page_num):
|
|
77 |
|
78 |
def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Build a single-chapter EPUB from plain text and write it to disk.

    Args:
        text: Plain text to place in the chapter. Blank lines separate
            paragraphs; single newlines become ``<br/>`` line breaks.
        output_path: Destination path of the generated ``.epub`` file.
        title: Book title; also rendered as the chapter heading.
        author: Author name stored in the book metadata.
        language: Language code stored in the metadata and on the chapter.
        cover_image: Path to an image file embedded as ``cover.jpg``.

    Raises:
        OSError: If ``cover_image`` cannot be opened or read.
    """
    # Function-scope imports so the block does not depend on file-level ones.
    import html
    import re

    book = epub.EpubBook()

    # Set metadata
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Add cover image
    with open(cover_image, "rb") as cover_file:
        cover_data = cover_file.read()
    cover_item = epub.EpubItem(uid="cover", file_name="cover.jpg", media_type="image/jpeg", content=cover_data)
    book.add_item(cover_item)

    # Create a chapter for the content. The text is escaped so that
    # characters such as "<" or "&" cannot corrupt the chapter markup, and it
    # is split on blank lines so paragraph breaks survive the conversion.
    raw_text = "" if text is None else str(text)
    paragraphs = [
        "<br/>".join(html.escape(line, quote=False) for line in block.splitlines())
        for block in re.split(r"\n\s*\n", raw_text.strip())
        if block.strip()
    ]
    body_html = "".join(f"<p>{paragraph}</p>" for paragraph in paragraphs)
    heading = html.escape(str(title), quote=False)

    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(f"<html><body><h1>{heading}</h1>{body_html}</body></html>")
    book.add_item(chapter)

    # Define Table of Contents (TOC)
    book.toc = (epub.Link("content.xhtml", "Content", "content"),)

    # Add default NCX and navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # The spine defines the reading order; without it the chapter is packaged
    # but never shown by reading systems.
    book.spine = ["nav", chapter]

    # Write the EPUB file
    epub.write_epub(output_path, book)
|
105 |
|
106 |
def convert_pdf_to_epub(pdf_file, title, author, language):
|
107 |
tmp_pdf_path = pdf_file.name
|
108 |
-
|
109 |
-
# Read PDF to get cover
|
110 |
reader = PdfReader(tmp_pdf_path)
|
111 |
-
first_page = reader.pages[0]
|
112 |
cover_path = "/tmp/cover.jpg"
|
113 |
images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
|
114 |
images[0].save(cover_path, "JPEG")
|
115 |
|
116 |
-
#
|
117 |
-
ocr_text =
|
|
|
|
|
|
|
|
|
118 |
|
119 |
-
# Write EPUB
|
120 |
epub_path = "/tmp/output.epub"
|
121 |
create_epub_from_text(
|
122 |
text=ocr_text,
|
@@ -126,7 +113,6 @@ def convert_pdf_to_epub(pdf_file, title, author, language):
|
|
126 |
language=language,
|
127 |
cover_image=cover_path
|
128 |
)
|
129 |
-
|
130 |
return epub_path, cover_path
|
131 |
|
132 |
def interface_fn(pdf, title, author, language):
|
|
|
15 |
cache_dir = "/tmp/huggingface_cache"
|
16 |
os.environ["HF_HOME"] = cache_dir
|
17 |
os.environ["TORCH_HOME"] = cache_dir
|
|
|
18 |
os.makedirs(cache_dir, exist_ok=True)
|
19 |
|
20 |
+
# Patch logging to avoid permission errors
|
21 |
import logging
|
22 |
+
from logging import FileHandler
|
23 |
+
class SafeFileHandler(FileHandler):
    """File handler that ignores the requested path and logs to a safe one.

    The target is taken from the ``OLMOCR_LOG_PATH`` environment variable,
    falling back to ``/tmp/olmocr-pipeline-debug.log``. The ``filename``
    argument is accepted only for signature compatibility with
    ``logging.FileHandler`` and is never used.
    """

    def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
        # Redirect all logs to tmp
        safe_path = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")

        # Make sure the target directory exists so opening the file cannot
        # fail just because a custom log directory has not been created yet.
        log_dir = os.path.dirname(safe_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # ``errors`` is only accepted by FileHandler on Python 3.9+; forward
        # it only when the caller actually set it so older versions still work.
        extra = {} if errors is None else {"errors": errors}
        super().__init__(safe_path, mode=mode, encoding=encoding, delay=delay, **extra)
|
28 |
+
logging.FileHandler = SafeFileHandler
|
29 |
+
|
30 |
+
# Now import olmocr
|
31 |
+
from olmocr.run_ocr import ocr_pdf_to_text
|
32 |
from olmocr.prompts import build_finetuning_prompt
|
33 |
from olmocr.prompts.anchor import get_anchor_text
|
34 |
+
from olmocr.data.renderpdf import render_pdf_to_base64png
|
35 |
|
36 |
# Load model and processor
|
37 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
41 |
).eval().to(device)
|
42 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
43 |
|
|
|
|
|
|
|
44 |
def ocr_page(pdf_path, page_num):
|
45 |
image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
|
46 |
anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
|
|
|
74 |
|
75 |
def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Build a single-chapter EPUB from plain text and write it to disk.

    Args:
        text: Plain text to place in the chapter. Blank lines separate
            paragraphs; single newlines become ``<br/>`` line breaks.
        output_path: Destination path of the generated ``.epub`` file.
        title: Book title; also rendered as the chapter heading.
        author: Author name stored in the book metadata.
        language: Language code stored in the metadata and on the chapter.
        cover_image: Path to an image file embedded as ``cover.jpg``.

    Raises:
        OSError: If ``cover_image`` cannot be opened or read.
    """
    # Function-scope imports so the block does not depend on file-level ones.
    import html
    import re

    book = epub.EpubBook()
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    with open(cover_image, "rb") as cover_file:
        cover_data = cover_file.read()
    cover_item = epub.EpubItem(uid="cover", file_name="cover.jpg", media_type="image/jpeg", content=cover_data)
    book.add_item(cover_item)

    # Escape the text so characters such as "<" or "&" cannot corrupt the
    # chapter markup, and split on blank lines so paragraph breaks survive.
    raw_text = "" if text is None else str(text)
    paragraphs = [
        "<br/>".join(html.escape(line, quote=False) for line in block.splitlines())
        for block in re.split(r"\n\s*\n", raw_text.strip())
        if block.strip()
    ]
    body_html = "".join(f"<p>{paragraph}</p>" for paragraph in paragraphs)
    heading = html.escape(str(title), quote=False)

    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(f"<html><body><h1>{heading}</h1>{body_html}</body></html>")
    book.add_item(chapter)

    book.toc = (epub.Link("content.xhtml", "Content", "content"),)

    # Ship both the NCX and the navigation document alongside the chapter.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # The spine defines the reading order; without it the chapter is packaged
    # but never shown by reading systems.
    book.spine = ["nav", chapter]

    epub.write_epub(output_path, book)
|
92 |
|
93 |
def convert_pdf_to_epub(pdf_file, title, author, language):
|
94 |
tmp_pdf_path = pdf_file.name
|
|
|
|
|
95 |
reader = PdfReader(tmp_pdf_path)
|
|
|
96 |
cover_path = "/tmp/cover.jpg"
|
97 |
images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
|
98 |
images[0].save(cover_path, "JPEG")
|
99 |
|
100 |
+
# Use official AllenAI OCR function
|
101 |
+
ocr_text = ocr_pdf_to_text(
|
102 |
+
pdf_path=tmp_pdf_path,
|
103 |
+
model=model,
|
104 |
+
processor=processor
|
105 |
+
)
|
106 |
|
|
|
107 |
epub_path = "/tmp/output.epub"
|
108 |
create_epub_from_text(
|
109 |
text=ocr_text,
|
|
|
113 |
language=language,
|
114 |
cover_image=cover_path
|
115 |
)
|
|
|
116 |
return epub_path, cover_path
|
117 |
|
118 |
def interface_fn(pdf, title, author, language):
|