# pdftoxml / app.py
# dindizz's picture
# Update app.py
# e550436 verified
import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2
# Characters the XML 1.0 specification forbids anywhere in a document: C0
# control characters other than tab/LF/CR, lone surrogates, and the
# U+FFFE/U+FFFF noncharacters. ElementTree does not reject them when
# serializing, so leaving them in would produce a file that is not well-formed.
_XML_ILLEGAL_CHARS_RE = re.compile(
    r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]"
)


def pdf_to_xml(pdf_path, xml_path):
    """
    Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root, one ``<page number="N">`` child per
    PDF page (numbered from 1), and one ``<line>`` child per newline-separated
    line of the page's extracted text. A page whose text is empty, or whose
    extraction fails, still gets an (empty) ``<page>`` element so that page
    numbering stays aligned with the PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path the UTF-8 encoded XML document (with XML declaration)
            is written to.

    Raises:
        Exception: Errors from opening the PDF, from PyPDF2 while parsing the
            document structure, or from writing the XML file are not caught
            here and propagate to the caller. Only per-page text extraction
            failures are handled (logged, page left empty).
    """
    root = ET.Element("document")

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception as exc:
                # Best-effort: one unreadable page must not abort the whole
                # conversion, but the failure should be visible in the logs.
                logging.getLogger(__name__).warning(
                    "Text extraction failed for page %d of %s: %s",
                    page_num, pdf_path, exc,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))
            if not page_text:
                continue

            # Drop characters that cannot legally appear in XML before they
            # reach the tree; PDF text streams frequently contain them.
            clean_text = _XML_ILLEGAL_CHARS_RE.sub("", page_text)
            for line in clean_text.split('\n'):
                line_element = ET.SubElement(page_element, "line")
                line_element.text = line

    tree = ET.ElementTree(root)
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
def pdf_to_xml_interface(pdf_file):
    """
    Gradio interface function that accepts an uploaded PDF file and returns
    the path of the converted XML file.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. Either an
            object exposing the uploaded file's path as ``.name`` or a plain
            path string; ``None`` when nothing was uploaded.

    Returns:
        Path of a temporary ``.xml`` file containing the conversion result.
        The file is created with ``delete=False`` so it outlives this call
        and can be served by the ``gr.File`` output.

    Raises:
        gr.Error: If no file was uploaded or the conversion failed. The
            output component is a file, so a message string returned from
            here would be treated as a file path; raising ``gr.Error`` is
            how the message is shown to the user instead.
    """
    # Handle case when no file is uploaded.
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Reserve a temporary path for the output XML; the handle is closed right
    # away so pdf_to_xml can reopen the path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as exc:
        # The temp file was created with delete=False; remove it so failed
        # conversions do not accumulate stray files.
        try:
            os.remove(output_path)
        except OSError:
            # Cleanup is best-effort; the conversion error is what matters.
            pass
        raise gr.Error(f"An error occurred during conversion: {exc}") from exc

    return output_path
# Build the web UI: a single PDF upload wired to a single XML download,
# with pdf_to_xml_interface doing the work in between.
iface = gr.Interface(
    title="PDF to XML Extractor",
    description=(
        "Upload a PDF file to extract its text into a structured XML format."
    ),
    fn=pdf_to_xml_interface,
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download XML File"),
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()