leonarb committed on
Commit
6a0411c
·
verified ·
1 Parent(s): a32e7b0

Reset all fixes

Browse files
Files changed (1) hide show
  1. app.py +49 -49
app.py CHANGED
@@ -1,71 +1,71 @@
1
- import gradio as gr
2
  import tempfile
3
- from io import BytesIO
4
- from PIL import Image
 
5
  from ebooklib import epub
6
- from olmocr.model import process_pdf # your forked olmocr model
7
 
8
- def process_pdf_to_epub(pdf_file, title, author):
9
- # Run the OCR + LLM pipeline from olmocr
10
- print("Starting PDF processing...")
11
- page_results = process_pdf(pdf_file.name)
12
 
13
- # Create the EPUB book
14
  book = epub.EpubBook()
15
  book.set_identifier("id123456")
16
- book.set_title(title if title else "Untitled Document")
17
  book.set_language("en")
18
- if author:
19
- book.add_author(author)
20
-
21
- # Try to use the first page as cover
22
- try:
23
- with Image.open(pdf_file.name) as img:
24
- img.convert("RGB").save("cover.jpg", "JPEG")
25
- with open("cover.jpg", "rb") as f:
26
- cover_data = f.read()
27
- book.set_cover("cover.jpg", cover_data)
28
- except Exception as e:
29
- print("Could not generate cover:", e)
30
 
31
- # Add chapters from pages
32
  chapters = []
33
- for i, page in enumerate(page_results):
34
- text = page.get("decoded", {}).get("natural_text", "")
35
- if not text.strip():
 
36
  continue
37
- safe_text = text.replace("\n", "<br/>")
38
  chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang="en")
 
 
39
  chapter.content = f"<h1>Page {i+1}</h1><p>{safe_text}</p>"
40
  book.add_item(chapter)
41
  chapters.append(chapter)
42
 
43
- # Define spine and table of contents
44
- book.toc = chapters
45
- book.spine = ["nav"] + chapters
 
 
 
46
  book.add_item(epub.EpubNcx())
47
  book.add_item(epub.EpubNav())
 
48
 
49
- # Write to in-memory buffer
50
- output_buffer = BytesIO()
51
- epub.write_epub(output_buffer, book)
52
- output_buffer.seek(0)
53
- return output_buffer
54
 
55
- with gr.Blocks() as demo:
56
- gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")
57
- with gr.Row():
58
- pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
59
- title = gr.Textbox(label="EPUB Title", placeholder="Optional title")
60
- author = gr.Textbox(label="Author", placeholder="Optional author name")
61
- convert_button = gr.Button("Convert to EPUB")
62
- epub_output = gr.File(label="Download EPUB", file_types=[".epub"])
63
 
64
- convert_button.click(
65
- fn=process_pdf_to_epub,
66
- inputs=[pdf_input, title, author],
67
- outputs=epub_output
68
- )
 
 
 
 
69
 
70
  if __name__ == "__main__":
71
- demo.launch()
 
 
 
 
 
 
 
1
+ import os
2
  import tempfile
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
  from ebooklib import epub
7
+ from olmocr import process_pdf # your forked olmocr model
8
 
9
def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
    """Convert a PDF into an EPUB file and return the EPUB's path.

    The PDF is handed to ``process_pdf``; every result whose
    ``decoded_content["natural_text"]`` is non-blank becomes one XHTML
    chapter titled "Page N", where N is the result's 1-based position in
    the result sequence (blank pages are skipped, so numbers may have gaps).

    Args:
        pdf_path: Path to the PDF, or an upload wrapper object that exposes
            the path as ``.name``.
        title: EPUB title; a blank or missing value falls back to "Untitled".
        author: EPUB author; a blank or missing value falls back to "Unknown".

    Returns:
        Filesystem path of the generated ``.epub`` file, created under
        ``/tmp``.

    Raises:
        ValueError: If no PDF was supplied, or no page yielded any text.
    """
    # Function-scope imports keep this block self-contained; the file's
    # import header does not provide these modules.
    import html
    import os
    import uuid

    if pdf_path is None:
        raise ValueError("No PDF file provided.")
    # Unwrap upload objects that carry the path in ``.name``. Real path
    # types are excluded because ``Path.name`` is only the basename.
    if not isinstance(pdf_path, (str, os.PathLike)):
        pdf_path = getattr(pdf_path, "name", pdf_path)
    print(f"Processing PDF: {pdf_path}")

    book = epub.EpubBook()
    # Each generated book gets its own identifier instead of a shared constant.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title or "Untitled")
    book.set_language("en")
    book.add_author(author or "Unknown")

    chapters = []

    # The working directory is only needed while the results are consumed,
    # so it is removed as soon as the chapters have been built.
    with tempfile.TemporaryDirectory() as output_dir:
        results = process_pdf(pdf_path, output_dir)

        for i, result in enumerate(results):
            # Tolerate a missing or None "decoded_content" / "natural_text".
            decoded = result.get("decoded_content") or {}
            text = decoded.get("natural_text") or ""
            if not text.strip():
                continue

            chapter = epub.EpubHtml(
                title=f"Page {i+1}",
                file_name=f"page_{i+1}.xhtml",
                lang="en",
            )
            # Escape markup characters first so OCR text containing '<' or
            # '&' cannot break the XHTML, then turn newlines into breaks.
            safe_text = html.escape(text, quote=False).replace("\n", "<br/>")
            chapter.content = f"<h1>Page {i+1}</h1><p>{safe_text}</p>"
            book.add_item(chapter)
            chapters.append(chapter)

            print(f"Processed page {i+1}")

    if not chapters:
        raise ValueError("No content extracted from PDF.")

    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # Reserve a unique output name, then close the handle before the EPUB
    # writer opens the same path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
        epub_path = tmp.name
    epub.write_epub(epub_path, book)
    print(f"EPUB written to {epub_path}")
    return epub_path
 
48
 
49
# Gradio UI: one upload plus two metadata fields in, one downloadable file out.
title_input = gr.Textbox(label="EPUB Title", value="Untitled")
author_input = gr.Textbox(label="Author", value="Unknown")
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

output_file = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    # Order must match process_pdf_to_epub(pdf_path, title, author).
    inputs=[file_input, title_input, author_input],
    outputs=output_file,
    title="PDF to EPUB Converter with olmOCR",
    # The converter does not generate a cover, so the description no longer
    # promises one.
    description=(
        "Upload a PDF to convert it into an EPUB. "
        "Each page with extracted text becomes a chapter."
    ),
)
63
 
if __name__ == "__main__":
    # Launch settings gathered in one mapping and unpacked into launch().
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "debug": True,
        # Generated EPUB files are written under /tmp.
        "allowed_paths": ["/tmp"],
    }
    iface.launch(**launch_options)