dindizz committed on
Commit
bf4ed2a
·
verified ·
1 Parent(s): 2fc939b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import xml.etree.ElementTree as ET
3
+ import gradio as gr
4
+ import tempfile
5
+
6
# Code points that XML 1.0 does not allow in character data: C0 controls other
# than tab/LF/CR, the UTF-16 surrogate range, and U+FFFE/U+FFFF. ElementTree
# writes them without validation, which yields a file XML parsers reject.
# Mapping each to None makes str.translate() delete it.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [
        *range(0x00, 0x09),
        0x0B,
        0x0C,
        *range(0x0E, 0x20),
        *range(0xD800, 0xE000),
        0xFFFE,
        0xFFFF,
    ]
)


def pdf_to_xml(pdf_path, xml_path):
    """
    Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root with one ``<page number="N">`` child
    per PDF page (numbered from 1). Each page holds one ``<line>`` element per
    newline-separated line of extracted text; a page with no extractable text
    produces an empty ``<page>`` element.

    Characters that are illegal in XML 1.0 are removed from the extracted
    text so the written file is always well-formed.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path the UTF-8 encoded XML document is written to.

    Returns:
        None. The result is the file written at ``xml_path``.
    """
    root = ET.Element("document")

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_element = ET.SubElement(root, "page", number=str(page_num))
            page_text = page.extract_text()

            # Keep the (empty) page element so page numbering stays complete.
            if not page_text:
                continue

            # Drop characters that would make the serialized XML malformed.
            page_text = page_text.translate(_XML_ILLEGAL_CHARS)
            for line in page_text.split('\n'):
                line_element = ET.SubElement(page_element, "line")
                line_element.text = line

    # All text is already in memory, so the PDF handle can be closed before
    # the tree is serialized.
    tree = ET.ElementTree(root)
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
25
+
26
def pdf_to_xml_interface(pdf_file):
    """
    Gradio interface function that accepts an uploaded PDF file and returns the converted XML file.

    Args:
        pdf_file: The value Gradio hands over for the File input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``name`` attribute is the path of the uploaded file. ``None``
            when the user submitted without uploading anything.

    Returns:
        Path of a temporary ``.xml`` file containing the conversion result.
        The file is intentionally not deleted here so it can be served as
        the download.

    Raises:
        gr.Error: If no file was uploaded.
    """
    # Local import: only needed for path detection and cleanup in this function.
    import os

    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before converting.")

    # Depending on the Gradio version / File `type` setting, the input is
    # either a path string or a file-like wrapper exposing the path as `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Create a temporary file to store the output XML. Only the name is
    # needed; the handle is closed immediately so pdf_to_xml can reopen it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception:
        # Do not leave an orphaned temp file behind when conversion fails.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    return output_path
37
+
38
# Build the web UI: a single PDF upload as input, a single XML file as the
# downloadable output, wired to the conversion callback.
iface = gr.Interface(
    title="PDF to XML Extractor",
    description="Upload a PDF file to extract its text into a structured XML format.",
    fn=pdf_to_xml_interface,
    inputs=gr.components.File(label="Upload PDF File"),
    outputs=gr.components.File(label="Download XML File"),
)

# Start the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()