dindizz committed on
Commit
e550436
·
verified ·
1 Parent(s): 1d4c478

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -8
app.py CHANGED
@@ -2,6 +2,7 @@ import PyPDF2
2
  import xml.etree.ElementTree as ET
3
  import gradio as gr
4
  import tempfile
 
5
 
6
  def pdf_to_xml(pdf_path, xml_path):
7
  """
@@ -12,7 +13,11 @@ def pdf_to_xml(pdf_path, xml_path):
12
  root = ET.Element("document")
13
 
14
  for page_num, page in enumerate(pdf_reader.pages, start=1):
15
- page_text = page.extract_text()
 
 
 
 
16
  page_element = ET.SubElement(root, "page", number=str(page_num))
17
 
18
  if page_text:
@@ -27,22 +32,31 @@ def pdf_to_xml_interface(pdf_file):
27
  """
28
  Gradio interface function that accepts an uploaded PDF file and returns the converted XML file.
29
  """
30
- # Create a temporary file to store the output XML
 
 
 
 
31
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
32
  output_path = tmp.name
33
-
34
- # Convert the uploaded PDF (pdf_file.name gives the file path)
35
- pdf_to_xml(pdf_file.name, output_path)
 
 
 
 
36
  return output_path
37
 
38
- # Define the Gradio interface
39
  iface = gr.Interface(
40
  fn=pdf_to_xml_interface,
41
- inputs=gr.components.File(label="Upload PDF File"),
42
- outputs=gr.components.File(label="Download XML File"),
43
  title="PDF to XML Extractor",
44
  description="Upload a PDF file to extract its text into a structured XML format."
45
  )
46
 
47
  if __name__ == "__main__":
 
48
  iface.launch()
 
2
  import xml.etree.ElementTree as ET
3
  import gradio as gr
4
  import tempfile
5
+ import os
6
 
7
  def pdf_to_xml(pdf_path, xml_path):
8
  """
 
13
  root = ET.Element("document")
14
 
15
  for page_num, page in enumerate(pdf_reader.pages, start=1):
16
+ try:
17
+ page_text = page.extract_text()
18
+ except Exception as e:
19
+ # Log or handle extraction errors per page if needed.
20
+ page_text = ""
21
  page_element = ET.SubElement(root, "page", number=str(page_num))
22
 
23
  if page_text:
 
32
  """
33
  Gradio interface function that accepts an uploaded PDF file and returns the converted XML file.
34
  """
35
+ # Handle case when no file is uploaded.
36
+ if pdf_file is None:
37
+ return "No file uploaded."
38
+
39
+ # Create a temporary file to store the output XML.
40
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
41
  output_path = tmp.name
42
+
43
+ try:
44
+ pdf_to_xml(pdf_file.name, output_path)
45
+ except Exception as e:
46
+ # You can return a more detailed error message if desired.
47
+ return f"An error occurred during conversion: {e}"
48
+
49
  return output_path
50
 
51
+ # Define the Gradio interface.
52
  iface = gr.Interface(
53
  fn=pdf_to_xml_interface,
54
+ inputs=gr.File(label="Upload PDF File"),
55
+ outputs=gr.File(label="Download XML File"),
56
  title="PDF to XML Extractor",
57
  description="Upload a PDF file to extract its text into a structured XML format."
58
  )
59
 
60
  if __name__ == "__main__":
61
+ # Launch the Gradio app.
62
  iface.launch()