leonarb committed on
Commit
822eba7
·
verified ·
1 Parent(s): 59ff001

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -10
app.py CHANGED
@@ -25,7 +25,6 @@ def process_pdf_to_epub(pdf_file, title, author):
25
  doc = fitz.open(pdf_path)
26
  num_pages = len(doc)
27
 
28
- # Create EPUB book
29
  book = epub.EpubBook()
30
  book.set_identifier("id123456")
31
  book.set_title(title)
@@ -38,12 +37,9 @@ def process_pdf_to_epub(pdf_file, title, author):
38
  print(f"Processing page {page_num}...")
39
 
40
  try:
41
- # Render page to base64 image
42
  image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
43
  anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
44
- print(f"Anchor text for page {page_num}: {anchor_text}")
45
 
46
- # New prompt format
47
  prompt = (
48
  "Below is the image of one page of a document, as well as some raw textual content that was previously "
49
  "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
@@ -99,28 +95,26 @@ def process_pdf_to_epub(pdf_file, title, author):
99
 
100
  print(f"Decoded content for page {page_num}: {decoded}")
101
 
102
- # Create chapter
103
  chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
104
  chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
105
  book.add_item(chapter)
106
  chapters.append(chapter)
107
 
108
- # Save cover image from page 1
109
  if page_num == 1:
110
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
111
  cover_io = BytesIO()
112
  cover_image.save(cover_io, format='PNG')
113
  book.set_cover("cover.png", cover_io.getvalue())
114
 
115
- # Assemble EPUB
116
  book.toc = tuple(chapters)
117
  book.add_item(epub.EpubNcx())
118
  book.add_item(epub.EpubNav())
119
  book.spine = ['nav'] + chapters
120
 
121
- output_path = "/tmp/output.epub"
122
- epub.write_epub(output_path, book)
123
- return output_path
 
124
 
125
  # Gradio Interface
126
  iface = gr.Interface(
 
25
  doc = fitz.open(pdf_path)
26
  num_pages = len(doc)
27
 
 
28
  book = epub.EpubBook()
29
  book.set_identifier("id123456")
30
  book.set_title(title)
 
37
  print(f"Processing page {page_num}...")
38
 
39
  try:
 
40
  image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
41
  anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
 
42
 
 
43
  prompt = (
44
  "Below is the image of one page of a document, as well as some raw textual content that was previously "
45
  "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
 
95
 
96
  print(f"Decoded content for page {page_num}: {decoded}")
97
 
 
98
  chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
99
  chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
100
  book.add_item(chapter)
101
  chapters.append(chapter)
102
 
 
103
  if page_num == 1:
104
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
105
  cover_io = BytesIO()
106
  cover_image.save(cover_io, format='PNG')
107
  book.set_cover("cover.png", cover_io.getvalue())
108
 
 
109
  book.toc = tuple(chapters)
110
  book.add_item(epub.EpubNcx())
111
  book.add_item(epub.EpubNav())
112
  book.spine = ['nav'] + chapters
113
 
114
+ # SAFELY write to a temp file in /tmp
115
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
116
+ epub.write_epub(tmp.name, book)
117
+ return tmp.name
118
 
119
  # Gradio Interface
120
  iface = gr.Interface(